diff --git a/modules/charset/charset.go b/modules/charset/charset.go index 597ce5120c..b156654973 100644 --- a/modules/charset/charset.go +++ b/modules/charset/charset.go @@ -5,12 +5,10 @@ package charset import ( "bytes" - "fmt" "io" "strings" "unicode/utf8" - "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/util" @@ -23,60 +21,39 @@ import ( var UTF8BOM = []byte{'\xef', '\xbb', '\xbf'} type ConvertOpts struct { - KeepBOM bool + KeepBOM bool + ErrorReplacement []byte + ErrorReturnOrigin bool } +var ToUTF8WithFallbackReaderPrefetchSize = 16 * 1024 + // ToUTF8WithFallbackReader detects the encoding of content and converts to UTF-8 reader if possible func ToUTF8WithFallbackReader(rd io.Reader, opts ConvertOpts) io.Reader { - buf := make([]byte, 2048) + buf := make([]byte, ToUTF8WithFallbackReaderPrefetchSize) n, err := util.ReadAtMost(rd, buf) if err != nil { - return io.MultiReader(bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)), rd) - } - - charsetLabel, err := DetectEncoding(buf[:n]) - if err != nil || charsetLabel == "UTF-8" { - return io.MultiReader(bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)), rd) - } - - encoding, _ := charset.Lookup(charsetLabel) - if encoding == nil { + // read error occurs, don't do any processing return io.MultiReader(bytes.NewReader(buf[:n]), rd) } - return transform.NewReader( - io.MultiReader( - bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)), - rd, - ), - encoding.NewDecoder(), - ) -} - -// ToUTF8 converts content to UTF8 encoding -func ToUTF8(content []byte, opts ConvertOpts) (string, error) { - charsetLabel, err := DetectEncoding(content) - if err != nil { - return "", err - } else if charsetLabel == "UTF-8" { - return string(MaybeRemoveBOM(content, opts)), nil + charsetLabel, _ := DetectEncoding(buf[:n]) + if charsetLabel == "UTF-8" { + // is utf-8, try to remove BOM and read it as-is + return io.MultiReader(bytes.NewReader(maybeRemoveBOM(buf[:n], opts)), rd) } encoding, _ := charset.Lookup(charsetLabel) if encoding == nil { - return string(content), fmt.Errorf("Unknown encoding: %s", charsetLabel) + // unknown charset, don't do any processing + return io.MultiReader(bytes.NewReader(buf[:n]), rd) } - // If there is an error, we concatenate the nicely decoded part and the - // original left over. This way we won't lose much data. - result, n, err := transform.Bytes(encoding.NewDecoder(), content) - if err != nil { - result = append(result, content[n:]...) - } - - result = MaybeRemoveBOM(result, opts) - - return string(result), err + // convert from charset to utf-8 + return transform.NewReader( + io.MultiReader(bytes.NewReader(buf[:n]), rd), + encoding.NewDecoder(), + ) } // ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible @@ -85,73 +62,84 @@ func ToUTF8WithFallback(content []byte, opts ConvertOpts) []byte { return bs } -// ToUTF8DropErrors makes sure the return string is valid utf-8; attempts conversion if possible -func ToUTF8DropErrors(content []byte, opts ConvertOpts) []byte { - charsetLabel, err := DetectEncoding(content) - if err != nil || charsetLabel == "UTF-8" { - return MaybeRemoveBOM(content, opts) +func ToUTF8DropErrors(content []byte) []byte { + return ToUTF8(content, ConvertOpts{ErrorReplacement: []byte{' '}}) +} + +func ToUTF8(content []byte, opts ConvertOpts) []byte { + charsetLabel, _ := DetectEncoding(content) + if charsetLabel == "UTF-8" { + return maybeRemoveBOM(content, opts) } encoding, _ := charset.Lookup(charsetLabel) if encoding == nil { + setting.PanicInDevOrTesting("unsupported detected charset %q, it shouldn't happen", charsetLabel) return content } - // We ignore any non-decodable parts from the file. - // Some parts might be lost var decoded []byte decoder := encoding.NewDecoder() idx := 0 - for { + for idx < len(content) { result, n, err := transform.Bytes(decoder, content[idx:]) decoded = append(decoded, result...) if err == nil { break } - decoded = append(decoded, ' ') - idx = idx + n + 1 - if idx >= len(content) { - break + if opts.ErrorReturnOrigin { + return content } + if opts.ErrorReplacement == nil { + decoded = append(decoded, content[idx+n]) + } else { + decoded = append(decoded, opts.ErrorReplacement...) + } + idx += n + 1 } - - return MaybeRemoveBOM(decoded, opts) + return maybeRemoveBOM(decoded, opts) } -// MaybeRemoveBOM removes a UTF-8 BOM from a []byte when opts.KeepBOM is false -func MaybeRemoveBOM(content []byte, opts ConvertOpts) []byte { +// maybeRemoveBOM removes a UTF-8 BOM from a []byte when opts.KeepBOM is false +func maybeRemoveBOM(content []byte, opts ConvertOpts) []byte { if opts.KeepBOM { return content } - if len(content) > 2 && bytes.Equal(content[0:3], UTF8BOM) { - return content[3:] - } - return content + return bytes.TrimPrefix(content, UTF8BOM) } // DetectEncoding detect the encoding of content -func DetectEncoding(content []byte) (string, error) { +// it always returns a detected or guessed "encoding" string, no matter error happens or not +func DetectEncoding(content []byte) (encoding string, _ error) { // First we check if the content represents valid utf8 content excepting a truncated character at the end. // Now we could decode all the runes in turn but this is not necessarily the cheapest thing to do - // instead we walk backwards from the end to trim off a the incomplete character + // instead we walk backwards from the end to trim off the incomplete character toValidate := content end := len(toValidate) - 1 - if end < 0 { - // no-op - } else if toValidate[end]>>5 == 0b110 { - // Incomplete 1 byte extension e.g. © which has been truncated to - toValidate = toValidate[:end] - } else if end > 0 && toValidate[end]>>6 == 0b10 && toValidate[end-1]>>4 == 0b1110 { - // Incomplete 2 byte extension e.g. ⛔ <9b><94> which has been truncated to <9b> - toValidate = toValidate[:end-1] - } else if end > 1 && toValidate[end]>>6 == 0b10 && toValidate[end-1]>>6 == 0b10 && toValidate[end-2]>>3 == 0b11110 { - // Incomplete 3 byte extension e.g. 💩 <9f><92> which has been truncated to <9f><92> - toValidate = toValidate[:end-2] + // U+0000 U+007F 0yyyzzzz + // U+0080 U+07FF 110xxxyy 10yyzzzz + // U+0800 U+FFFF 1110wwww 10xxxxyy 10yyzzzz + // U+010000 U+10FFFF 11110uvv 10vvwwww 10xxxxyy 10yyzzzz + cnt := 0 + for end >= 0 && cnt < 4 { + c := toValidate[end] + if c>>5 == 0b110 || c>>4 == 0b1110 || c>>3 == 0b11110 { + // a leading byte + toValidate = toValidate[:end] + break + } else if c>>6 == 0b10 { + // a continuation byte + end-- + } else { + // not an utf-8 byte + break + } + cnt++ } + if utf8.Valid(toValidate) { - log.Debug("Detected encoding: utf-8 (fast)") return "UTF-8", nil } @@ -160,7 +148,7 @@ func DetectEncoding(content []byte) (string, error) { if len(content) < 1024 { // Check if original content is valid if _, err := textDetector.DetectBest(content); err != nil { - return "", err + return util.IfZero(setting.Repository.AnsiCharset, "UTF-8"), err } times := 1024 / len(content) detectContent = make([]byte, 0, times*len(content)) @@ -171,14 +159,10 @@ func DetectEncoding(content []byte) (string, error) { detectContent = content } - // Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break + // Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie-break results, err := textDetector.DetectAll(detectContent) if err != nil { - if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 { - log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) - return setting.Repository.AnsiCharset, nil - } - return "", err + return util.IfZero(setting.Repository.AnsiCharset, "UTF-8"), err } topConfidence := results[0].Confidence @@ -201,11 +185,9 @@ func DetectEncoding(content []byte) (string, error) { } // FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument - if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 { - log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) + if topResult.Charset != "UTF-8" && setting.Repository.AnsiCharset != "" { return setting.Repository.AnsiCharset, err } - log.Debug("Detected encoding: %s", topResult.Charset) - return topResult.Charset, err + return topResult.Charset, nil } diff --git a/modules/charset/charset_test.go b/modules/charset/charset_test.go index cd2e3b9aaa..0314abc347 100644 --- a/modules/charset/charset_test.go +++ b/modules/charset/charset_test.go @@ -4,108 +4,89 @@ package charset import ( - "bytes" "io" + "os" "strings" "testing" "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/test" "github.com/stretchr/testify/assert" ) -func resetDefaultCharsetsOrder() { - defaultDetectedCharsetsOrder := make([]string, 0, len(setting.Repository.DetectedCharsetsOrder)) - for _, charset := range setting.Repository.DetectedCharsetsOrder { - defaultDetectedCharsetsOrder = append(defaultDetectedCharsetsOrder, strings.ToLower(strings.TrimSpace(charset))) - } +func TestMain(m *testing.M) { setting.Repository.DetectedCharsetScore = map[string]int{} - i := 0 - for _, charset := range defaultDetectedCharsetsOrder { - canonicalCharset := strings.ToLower(strings.TrimSpace(charset)) - if _, has := setting.Repository.DetectedCharsetScore[canonicalCharset]; !has { - setting.Repository.DetectedCharsetScore[canonicalCharset] = i - i++ - } + for i, charset := range setting.Repository.DetectedCharsetsOrder { + setting.Repository.DetectedCharsetScore[strings.ToLower(charset)] = i } + os.Exit(m.Run()) } func TestMaybeRemoveBOM(t *testing.T) { - res := MaybeRemoveBOM([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) + res := maybeRemoveBOM([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res) - res = MaybeRemoveBOM([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) + res = maybeRemoveBOM([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res) } func TestToUTF8(t *testing.T) { - resetDefaultCharsetsOrder() - // Note: golang compiler seems so behave differently depending on the current // locale, so some conversions might behave differently. For that reason, we don't // depend on particular conversions but in expected behaviors. - res, err := ToUTF8([]byte{0x41, 0x42, 0x43}, ConvertOpts{}) - assert.NoError(t, err) - assert.Equal(t, "ABC", res) + res := ToUTF8([]byte{0x41, 0x42, 0x43}, ConvertOpts{}) + assert.Equal(t, "ABC", string(res)) // "áéíóú" - res, err = ToUTF8([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) - assert.NoError(t, err) - assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res)) + res = ToUTF8([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) + assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res) // "áéíóú" - res, err = ToUTF8([]byte{ + res = ToUTF8([]byte{ 0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba, }, ConvertOpts{}) - assert.NoError(t, err) - assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res)) + assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res) - res, err = ToUTF8([]byte{ + res = ToUTF8([]byte{ 0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e, }, ConvertOpts{}) - assert.NoError(t, err) stringMustStartWith(t, "Hola,", res) stringMustEndWith(t, "AAA.", res) - res, err = ToUTF8([]byte{ + res = ToUTF8([]byte{ 0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e, }, ConvertOpts{}) - assert.NoError(t, err) stringMustStartWith(t, "Hola,", res) stringMustEndWith(t, "AAA.", res) - res, err = ToUTF8([]byte{ + res = ToUTF8([]byte{ 0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e, }, ConvertOpts{}) - assert.NoError(t, err) stringMustStartWith(t, "Hola,", res) stringMustEndWith(t, "AAA.", res) // Japanese (Shift-JIS) // 日属秘ぞしちゅ。 - res, err = ToUTF8([]byte{ + res = ToUTF8([]byte{ 0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42, }, ConvertOpts{}) - assert.NoError(t, err) assert.Equal(t, []byte{ 0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3, 0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82, - }, - []byte(res)) + }, res) - res, err = ToUTF8([]byte{0x00, 0x00, 0x00, 0x00}, ConvertOpts{}) - assert.NoError(t, err) - assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, []byte(res)) + res = ToUTF8([]byte{0x00, 0x00, 0x00, 0x00}, ConvertOpts{}) + assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res) } func TestToUTF8WithFallback(t *testing.T) { - resetDefaultCharsetsOrder() // "ABC" res := ToUTF8WithFallback([]byte{0x41, 0x42, 0x43}, ConvertOpts{}) assert.Equal(t, []byte{0x41, 0x42, 0x43}, res) @@ -152,54 +133,58 @@ func TestToUTF8WithFallback(t *testing.T) { } func TestToUTF8DropErrors(t *testing.T) { - resetDefaultCharsetsOrder() // "ABC" - res := ToUTF8DropErrors([]byte{0x41, 0x42, 0x43}, ConvertOpts{}) + res := ToUTF8DropErrors([]byte{0x41, 0x42, 0x43}) assert.Equal(t, []byte{0x41, 0x42, 0x43}, res) // "áéíóú" - res = ToUTF8DropErrors([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) + res = ToUTF8DropErrors([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}) assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res) // UTF8 BOM + "áéíóú" - res = ToUTF8DropErrors([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) + res = ToUTF8DropErrors([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}) assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res) // "Hola, así cómo ños" - res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73}, ConvertOpts{}) + res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73}) assert.Equal(t, []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73}, res[:8]) assert.Equal(t, []byte{0x73}, res[len(res)-1:]) // "Hola, así cómo " minmatch := []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20} - res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73}, ConvertOpts{}) + res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73}) // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those assert.Equal(t, minmatch, res[0:len(minmatch)]) - res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73}, ConvertOpts{}) + res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73}) // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those assert.Equal(t, minmatch, res[0:len(minmatch)]) // Japanese (Shift-JIS) // "日属秘ぞしちゅ。" - res = ToUTF8DropErrors([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42}, ConvertOpts{}) + res = ToUTF8DropErrors([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42}) assert.Equal(t, []byte{ 0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3, 0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82, }, res) - res = ToUTF8DropErrors([]byte{0x00, 0x00, 0x00, 0x00}, ConvertOpts{}) + res = ToUTF8DropErrors([]byte{0x00, 0x00, 0x00, 0x00}) assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res) } func TestDetectEncoding(t *testing.T) { - resetDefaultCharsetsOrder() testSuccess := func(b []byte, expected string) { encoding, err := DetectEncoding(b) assert.NoError(t, err) assert.Equal(t, expected, encoding) } + + // invalid bytes + encoding, err := DetectEncoding([]byte{0xfa}) + assert.Error(t, err) + assert.Equal(t, "UTF-8", encoding) + // utf-8 b := []byte("just some ascii") testSuccess(b, "UTF-8") @@ -214,169 +199,49 @@ func TestDetectEncoding(t *testing.T) { // iso-8859-1: dcor b = []byte{0x44, 0xe9, 0x63, 0x6f, 0x72, 0x0a} - encoding, err := DetectEncoding(b) + encoding, err = DetectEncoding(b) assert.NoError(t, err) assert.Contains(t, encoding, "ISO-8859-1") - old := setting.Repository.AnsiCharset - setting.Repository.AnsiCharset = "placeholder" - defer func() { - setting.Repository.AnsiCharset = old - }() - testSuccess(b, "placeholder") - - // invalid bytes - b = []byte{0xfa} - _, err = DetectEncoding(b) - assert.Error(t, err) + defer test.MockVariableValue(&setting.Repository.AnsiCharset, "MyEncoding")() + testSuccess(b, "MyEncoding") } -func stringMustStartWith(t *testing.T, expected, value string) { - assert.Equal(t, expected, value[:len(expected)]) +func stringMustStartWith(t *testing.T, expected string, value []byte) { + assert.Equal(t, expected, string(value[:len(expected)])) } -func stringMustEndWith(t *testing.T, expected, value string) { - assert.Equal(t, expected, value[len(value)-len(expected):]) +func stringMustEndWith(t *testing.T, expected string, value []byte) { + assert.Equal(t, expected, string(value[len(value)-len(expected):])) } func TestToUTF8WithFallbackReader(t *testing.T) { - resetDefaultCharsetsOrder() + test.MockVariableValue(&ToUTF8WithFallbackReaderPrefetchSize) - for testLen := range 2048 { - pattern := " test { () }\n" - input := "" - for len(input) < testLen { - input += pattern - } - input = input[:testLen] - input += "// Выключаем" - rd := ToUTF8WithFallbackReader(bytes.NewReader([]byte(input)), ConvertOpts{}) + block := "aá啊🤔" + runes := []rune(block) + assert.Len(t, string(runes[0]), 1) + assert.Len(t, string(runes[1]), 2) + assert.Len(t, string(runes[2]), 3) + assert.Len(t, string(runes[3]), 4) + + content := strings.Repeat(block, 2) + for i := 1; i < len(content); i++ { + encoding, err := DetectEncoding([]byte(content[:i])) + assert.NoError(t, err) + assert.Equal(t, "UTF-8", encoding) + + ToUTF8WithFallbackReaderPrefetchSize = i + rd := ToUTF8WithFallbackReader(strings.NewReader(content), ConvertOpts{}) r, _ := io.ReadAll(rd) - assert.Equalf(t, input, string(r), "testing string len=%d", testLen) + assert.Equal(t, content, string(r)) + } + for _, r := range runes { + content = "abc abc " + string(r) + string(r) + string(r) + for i := 0; i < len(content); i++ { + encoding, err := DetectEncoding([]byte(content[:i])) + assert.NoError(t, err) + assert.Equal(t, "UTF-8", encoding) + } } - - truncatedOneByteExtension := failFastBytes - encoding, _ := DetectEncoding(truncatedOneByteExtension) - assert.Equal(t, "UTF-8", encoding) - - truncatedTwoByteExtension := failFastBytes - truncatedTwoByteExtension[len(failFastBytes)-1] = 0x9b - truncatedTwoByteExtension[len(failFastBytes)-2] = 0xe2 - - encoding, _ = DetectEncoding(truncatedTwoByteExtension) - assert.Equal(t, "UTF-8", encoding) - - truncatedThreeByteExtension := failFastBytes - truncatedThreeByteExtension[len(failFastBytes)-1] = 0x92 - truncatedThreeByteExtension[len(failFastBytes)-2] = 0x9f - truncatedThreeByteExtension[len(failFastBytes)-3] = 0xf0 - - encoding, _ = DetectEncoding(truncatedThreeByteExtension) - assert.Equal(t, "UTF-8", encoding) -} - -var failFastBytes = []byte{ - 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x6f, 0x72, 0x67, 0x2e, 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2e, 0x74, 0x6f, - 0x6f, 0x6c, 0x73, 0x2e, 0x61, 0x6e, 0x74, 0x2e, 0x74, 0x61, 0x73, 0x6b, 0x64, 0x65, 0x66, 0x73, 0x2e, 0x63, 0x6f, 0x6e, - 0x64, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x4f, 0x73, 0x0a, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x6f, 0x72, 0x67, - 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x62, 0x6f, 0x6f, - 0x74, 0x2e, 0x67, 0x72, 0x61, 0x64, 0x6c, 0x65, 0x2e, 0x74, 0x61, 0x73, 0x6b, 0x73, 0x2e, 0x72, 0x75, 0x6e, 0x2e, 0x42, - 0x6f, 0x6f, 0x74, 0x52, 0x75, 0x6e, 0x0a, 0x0a, 0x70, 0x6c, 0x75, 0x67, 0x69, 0x6e, 0x73, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x64, 0x28, 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, - 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x62, 0x6f, 0x6f, 0x74, 0x22, 0x29, 0x0a, 0x7d, 0x0a, 0x0a, 0x64, 0x65, 0x70, 0x65, - 0x6e, 0x64, 0x65, 0x6e, 0x63, 0x69, 0x65, 0x73, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, - 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a, - 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, 0x61, 0x70, 0x69, 0x22, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, - 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, - 0x28, 0x22, 0x3a, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, 0x61, 0x70, 0x69, 0x2d, 0x64, 0x6f, 0x63, 0x73, 0x22, 0x29, - 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, - 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, 0x64, 0x62, - 0x22, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, - 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, - 0x73, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x22, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, - 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a, - 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, 0x69, 0x6e, 0x74, 0x65, 0x67, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x2d, 0x66, - 0x73, 0x22, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, - 0x69, 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, - 0x3a, 0x69, 0x6e, 0x74, 0x65, 0x67, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x2d, 0x6d, 0x71, 0x22, 0x29, 0x29, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, - 0x6a, 0x66, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x70, 0x65, 0x3a, 0x70, 0x65, 0x2d, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, - 0x2d, 0x61, 0x75, 0x74, 0x68, 0x2d, 0x72, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x2d, 0x73, 0x74, 0x61, 0x72, 0x74, - 0x65, 0x72, 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, - 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6a, 0x66, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x70, 0x65, 0x3a, 0x70, 0x65, 0x2d, 0x63, - 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2d, 0x68, 0x61, 0x6c, 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, - 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6a, 0x66, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x2e, - 0x70, 0x65, 0x3a, 0x70, 0x65, 0x2d, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2d, 0x63, 0x6f, 0x72, 0x65, 0x22, 0x29, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, - 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, - 0x2e, 0x62, 0x6f, 0x6f, 0x74, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x62, 0x6f, 0x6f, 0x74, 0x2d, 0x73, 0x74, - 0x61, 0x72, 0x74, 0x65, 0x72, 0x2d, 0x77, 0x65, 0x62, 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, - 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, - 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x62, 0x6f, 0x6f, 0x74, 0x3a, 0x73, 0x70, 0x72, - 0x69, 0x6e, 0x67, 0x2d, 0x62, 0x6f, 0x6f, 0x74, 0x2d, 0x73, 0x74, 0x61, 0x72, 0x74, 0x65, 0x72, 0x2d, 0x61, 0x6f, 0x70, - 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, - 0x6e, 0x28, 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, - 0x72, 0x6b, 0x2e, 0x62, 0x6f, 0x6f, 0x74, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x62, 0x6f, 0x6f, 0x74, 0x2d, - 0x73, 0x74, 0x61, 0x72, 0x74, 0x65, 0x72, 0x2d, 0x61, 0x63, 0x74, 0x75, 0x61, 0x74, 0x6f, 0x72, 0x22, 0x29, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, - 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x63, - 0x6c, 0x6f, 0x75, 0x64, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6c, 0x6f, 0x75, 0x64, 0x2d, 0x73, 0x74, - 0x61, 0x72, 0x74, 0x65, 0x72, 0x2d, 0x62, 0x6f, 0x6f, 0x74, 0x73, 0x74, 0x72, 0x61, 0x70, 0x22, 0x29, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, 0x72, - 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x63, 0x6c, - 0x6f, 0x75, 0x64, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6c, 0x6f, 0x75, 0x64, 0x2d, 0x73, 0x74, 0x61, - 0x72, 0x74, 0x65, 0x72, 0x2d, 0x63, 0x6f, 0x6e, 0x73, 0x75, 0x6c, 0x2d, 0x61, 0x6c, 0x6c, 0x22, 0x29, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, 0x72, - 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x63, 0x6c, - 0x6f, 0x75, 0x64, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6c, 0x6f, 0x75, 0x64, 0x2d, 0x73, 0x74, 0x61, - 0x72, 0x74, 0x65, 0x72, 0x2d, 0x73, 0x6c, 0x65, 0x75, 0x74, 0x68, 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, - 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, - 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x72, 0x65, 0x74, 0x72, 0x79, 0x3a, - 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x72, 0x65, 0x74, 0x72, 0x79, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x63, 0x68, 0x2e, 0x71, - 0x6f, 0x73, 0x2e, 0x6c, 0x6f, 0x67, 0x62, 0x61, 0x63, 0x6b, 0x3a, 0x6c, 0x6f, 0x67, 0x62, 0x61, 0x63, 0x6b, 0x2d, 0x63, - 0x6c, 0x61, 0x73, 0x73, 0x69, 0x63, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, - 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x69, 0x6f, 0x2e, 0x6d, 0x69, 0x63, 0x72, 0x6f, 0x6d, 0x65, - 0x74, 0x65, 0x72, 0x3a, 0x6d, 0x69, 0x63, 0x72, 0x6f, 0x6d, 0x65, 0x74, 0x65, 0x72, 0x2d, 0x72, 0x65, 0x67, 0x69, 0x73, - 0x74, 0x72, 0x79, 0x2d, 0x70, 0x72, 0x6f, 0x6d, 0x65, 0x74, 0x68, 0x65, 0x75, 0x73, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x6b, 0x6f, 0x74, - 0x6c, 0x69, 0x6e, 0x28, 0x22, 0x73, 0x74, 0x64, 0x6c, 0x69, 0x62, 0x22, 0x29, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, - 0x2f, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x65, 0x73, 0x74, 0x20, 0x64, 0x65, 0x70, 0x65, 0x6e, 0x64, - 0x65, 0x6e, 0x63, 0x69, 0x65, 0x73, 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, - 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, - 0x65, 0x73, 0x74, 0x49, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6a, - 0x66, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x70, 0x65, 0x3a, 0x70, 0x65, 0x2d, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2d, - 0x74, 0x65, 0x73, 0x74, 0x22, 0x29, 0x0a, 0x7d, 0x0a, 0x0a, 0x76, 0x61, 0x6c, 0x20, 0x70, 0x61, 0x74, 0x63, 0x68, 0x4a, - 0x61, 0x72, 0x20, 0x62, 0x79, 0x20, 0x74, 0x61, 0x73, 0x6b, 0x73, 0x2e, 0x72, 0x65, 0x67, 0x69, 0x73, 0x74, 0x65, 0x72, - 0x69, 0x6e, 0x67, 0x28, 0x4a, 0x61, 0x72, 0x3a, 0x3a, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x61, 0x72, 0x63, 0x68, 0x69, 0x76, 0x65, 0x43, 0x6c, 0x61, 0x73, 0x73, 0x69, 0x66, 0x69, 0x65, 0x72, 0x2e, - 0x73, 0x65, 0x74, 0x28, 0x22, 0x70, 0x61, 0x74, 0x63, 0x68, 0x65, 0x64, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x76, 0x61, 0x6c, 0x20, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x43, 0x6c, 0x61, 0x73, 0x73, 0x70, 0x61, 0x74, 0x68, - 0x20, 0x62, 0x79, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x75, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x2e, 0x67, - 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x6e, 0x69, 0x66, 0x65, 0x73, 0x74, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, - 0x73, 0x28, 0x22, 0x43, 0x6c, 0x61, 0x73, 0x73, 0x2d, 0x50, 0x61, 0x74, 0x68, 0x22, 0x20, 0x74, 0x6f, 0x20, 0x6f, 0x62, - 0x6a, 0x65, 0x63, 0x74, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, - 0x72, 0x69, 0x76, 0x61, 0x74, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x20, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x3d, - 0x20, 0x22, 0x66, 0x69, 0x6c, 0x65, 0x3a, 0x2f, 0x2b, 0x22, 0x2e, 0x74, 0x6f, 0x52, 0x65, 0x67, 0x65, 0x78, 0x28, 0x29, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x76, 0x65, 0x72, 0x72, 0x69, 0x64, - 0x65, 0x20, 0x66, 0x75, 0x6e, 0x20, 0x74, 0x6f, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x29, 0x3a, 0x20, 0x53, 0x74, - 0x72, 0x69, 0x6e, 0x67, 0x20, 0x3d, 0x20, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x43, 0x6c, 0x61, 0x73, 0x73, 0x70, - 0x61, 0x74, 0x68, 0x2e, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x54, 0x6f, 0x53, 0x74, 0x72, 0x69, - 0x6e, 0x67, 0x28, 0x22, 0x20, 0x22, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x74, 0x2e, 0x74, 0x6f, 0x55, 0x52, 0x49, 0x28, 0x29, 0x2e, 0x74, 0x6f, 0x55, - 0x52, 0x4c, 0x28, 0x29, 0x2e, 0x74, 0x6f, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x29, 0x2e, 0x72, 0x65, 0x70, 0x6c, - 0x61, 0x63, 0x65, 0x46, 0x69, 0x72, 0x73, 0x74, 0x28, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x2c, 0x20, 0x22, 0x2f, - 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x74, 0x61, 0x73, - 0x6b, 0x73, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x64, 0x3c, 0x42, 0x6f, 0x6f, 0x74, 0x52, 0x75, 0x6e, 0x3e, 0x28, 0x22, 0x62, - 0x6f, 0x6f, 0x74, 0x52, 0x75, 0x6e, 0x22, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x4f, - 0x73, 0x2e, 0x69, 0x73, 0x46, 0x61, 0x6d, 0x69, 0x6c, 0x79, 0x28, 0x4f, 0x73, 0x2e, 0x46, 0x41, 0x4d, 0x49, 0x4c, 0x59, - 0x5f, 0x57, 0x49, 0x4e, 0x44, 0x4f, 0x57, 0x53, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x70, 0x61, 0x74, 0x68, 0x20, 0x3d, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x28, 0x73, - 0x6f, 0x75, 0x72, 0x63, 0x65, 0x53, 0x65, 0x74, 0x73, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x64, 0x28, 0x22, 0x6d, 0x61, 0x69, - 0x6e, 0x22, 0x29, 0x2e, 0x6d, 0x61, 0x70, 0x20, 0x7b, 0x20, 0x69, 0x74, 0x2e, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x20, - 0x7d, 0x2c, 0x20, 0x70, 0x61, 0x74, 0x63, 0x68, 0x4a, 0x61, 0x72, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0xd0, } diff --git a/modules/httplib/serve.go b/modules/httplib/serve.go index b4c5e7fe1e..2d66a86a8b 100644 --- a/modules/httplib/serve.go +++ b/modules/httplib/serve.go @@ -19,7 +19,6 @@ import ( charsetModule "code.gitea.io/gitea/modules/charset" "code.gitea.io/gitea/modules/container" "code.gitea.io/gitea/modules/httpcache" - "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/typesniffer" "code.gitea.io/gitea/modules/util" @@ -109,11 +108,7 @@ func setServeHeadersByFile(r *http.Request, w http.ResponseWriter, mineBuf []byt } if isPlain { - charset, err := charsetModule.DetectEncoding(mineBuf) - if err != nil { - log.Error("Detect raw file %s charset failed: %v, using by default utf-8", opts.Filename, err) - charset = "utf-8" - } + charset, _ := charsetModule.DetectEncoding(mineBuf) opts.ContentTypeCharset = strings.ToLower(charset) } diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go index bdb477ce6e..5f6a7f6082 100644 --- a/modules/indexer/code/bleve/bleve.go +++ b/modules/indexer/code/bleve/bleve.go @@ -203,7 +203,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro RepoID: repo.ID, CommitID: commitSha, Filename: update.Filename, - Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})), + Content: string(charset.ToUTF8DropErrors(fileContents)), Language: analyze.GetCodeLanguage(update.Filename, fileContents), UpdatedAt: time.Now().UTC(), }) diff --git a/modules/indexer/code/elasticsearch/elasticsearch.go b/modules/indexer/code/elasticsearch/elasticsearch.go index b2eb301a5d..a7027051d2 100644 --- a/modules/indexer/code/elasticsearch/elasticsearch.go +++ b/modules/indexer/code/elasticsearch/elasticsearch.go @@ -191,7 +191,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro Doc(map[string]any{ "repo_id": repo.ID, "filename": update.Filename, - "content": string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})), + "content": string(charset.ToUTF8DropErrors(fileContents)), "commit_id": sha, "language": analyze.GetCodeLanguage(update.Filename, fileContents), "updated_at": timeutil.TimeStampNow(), diff --git a/modules/setting/setting.go b/modules/setting/setting.go index e14997801f..dc60d99bd6 100644 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -240,4 +240,5 @@ func PanicInDevOrTesting(msg string, a ...any) { if !IsProd || IsInTesting { panic(fmt.Sprintf(msg, a...)) } + log.Error(msg, a...) } diff --git a/routers/web/repo/editor.go b/routers/web/repo/editor.go index 983249a6d2..048c9f3d4a 100644 --- a/routers/web/repo/editor.go +++ b/routers/web/repo/editor.go @@ -317,11 +317,7 @@ func EditFile(ctx *context.Context) { ctx.ServerError("ReadAll", err) return } - if content, err := charset.ToUTF8(buf, charset.ConvertOpts{KeepBOM: true}); err != nil { - ctx.Data["FileContent"] = string(buf) - } else { - ctx.Data["FileContent"] = content - } + ctx.Data["FileContent"] = string(charset.ToUTF8(buf, charset.ConvertOpts{KeepBOM: true, ErrorReturnOrigin: true})) } } diff --git a/services/gitdiff/gitdiff.go b/services/gitdiff/gitdiff.go index 6e15f71609..f8fde6ab29 100644 --- a/services/gitdiff/gitdiff.go +++ b/services/gitdiff/gitdiff.go @@ -835,11 +835,11 @@ parsingLoop: if buffer.Len() == 0 { continue } - charsetLabel, err := charset.DetectEncoding(buffer.Bytes()) - if charsetLabel != "UTF-8" && err == nil { - encoding, _ := stdcharset.Lookup(charsetLabel) - if encoding != nil { - diffLineTypeDecoders[lineType] = encoding.NewDecoder() + charsetLabel, _ := charset.DetectEncoding(buffer.Bytes()) + if charsetLabel != "UTF-8" { + charsetEncoding, _ := stdcharset.Lookup(charsetLabel) + if charsetEncoding != nil { + diffLineTypeDecoders[lineType] = charsetEncoding.NewDecoder() } } } @@ -1325,10 +1325,10 @@ func GetDiffForRender(ctx context.Context, repoLink string, gitRepo *git.Reposit shouldFullFileHighlight := !setting.Git.DisableDiffHighlight && attrDiff.Value() == "" if shouldFullFileHighlight { if limitedContent.LeftContent != nil && limitedContent.LeftContent.buf.Len() < MaxDiffHighlightEntireFileSize { - diffFile.highlightedLeftLines = highlightCodeLines(diffFile, true /* left */, limitedContent.LeftContent.buf.String()) + diffFile.highlightedLeftLines = highlightCodeLines(diffFile, true /* left */, limitedContent.LeftContent.buf.Bytes()) } if limitedContent.RightContent != nil && limitedContent.RightContent.buf.Len() < MaxDiffHighlightEntireFileSize { - diffFile.highlightedRightLines = highlightCodeLines(diffFile, false /* right */, limitedContent.RightContent.buf.String()) + diffFile.highlightedRightLines = highlightCodeLines(diffFile, false /* right */, limitedContent.RightContent.buf.Bytes()) } } } @@ -1336,9 +1336,34 @@ func GetDiffForRender(ctx context.Context, repoLink string, gitRepo *git.Reposit return diff, nil } -func highlightCodeLines(diffFile *DiffFile, isLeft bool, content string) map[int]template.HTML { +func splitHighlightLines(buf []byte) (ret [][]byte) { + lineCount := bytes.Count(buf, []byte("\n")) + 1 + ret = make([][]byte, 0, lineCount) + nlTagClose := []byte("\n" right after \n, sometimes before. + // * "text\n" + // * "text\n" + if bytes.HasPrefix(buf[pos:], nlTagClose) { + pos1 := bytes.IndexByte(buf[pos:], '>') + if pos1 != -1 { + pos += pos1 + } + } + ret = append(ret, buf[:pos+1]) + buf = buf[pos+1:] + } +} + +func highlightCodeLines(diffFile *DiffFile, isLeft bool, rawContent []byte) map[int]template.HTML { + content := util.UnsafeBytesToString(charset.ToUTF8(rawContent, charset.ConvertOpts{})) highlightedNewContent, _ := highlight.Code(diffFile.Name, diffFile.Language, content) - splitLines := strings.Split(string(highlightedNewContent), "\n") + splitLines := splitHighlightLines([]byte(highlightedNewContent)) lines := make(map[int]template.HTML, len(splitLines)) // only save the highlighted lines we need, but not the whole file, to save memory for _, sec := range diffFile.Sections { diff --git a/services/gitdiff/gitdiff_test.go b/services/gitdiff/gitdiff_test.go index 721ae0dfc7..a94dad8b63 100644 --- a/services/gitdiff/gitdiff_test.go +++ b/services/gitdiff/gitdiff_test.go @@ -5,6 +5,7 @@ package gitdiff import ( + "html/template" "strconv" "strings" "testing" @@ -1106,3 +1107,41 @@ func TestDiffLine_GetExpandDirection(t *testing.T) { assert.Equal(t, c.direction, c.diffLine.GetExpandDirection(), "case %s expected direction: %s", c.name, c.direction) } } + +func TestHighlightCodeLines(t *testing.T) { + t.Run("CharsetDetecting", func(t *testing.T) { + diffFile := &DiffFile{ + Name: "a.c", + Language: "c", + Sections: []*DiffSection{ + { + Lines: []*DiffLine{{LeftIdx: 1}}, + }, + }, + } + ret := highlightCodeLines(diffFile, true, []byte("// abc\xcc def\xcd")) // ISO-8859-1 bytes + assert.Equal(t, "// abcÌ defÍ\n", string(ret[0])) + }) + + t.Run("LeftLines", func(t *testing.T) { + diffFile := &DiffFile{ + Name: "a.c", + Language: "c", + Sections: []*DiffSection{ + { + Lines: []*DiffLine{ + {LeftIdx: 1}, + {LeftIdx: 2}, + {LeftIdx: 3}, + }, + }, + }, + } + const nl = "\n" + ret := highlightCodeLines(diffFile, true, []byte("a\nb\n")) + assert.Equal(t, map[int]template.HTML{ + 0: `a` + nl, + 1: `b`, + }, ret) + }) +} diff --git a/services/gitdiff/highlightdiff_test.go b/services/gitdiff/highlightdiff_test.go index aebe38ae7c..0df2e29d13 100644 --- a/services/gitdiff/highlightdiff_test.go +++ b/services/gitdiff/highlightdiff_test.go @@ -25,12 +25,12 @@ func TestDiffWithHighlight(t *testing.T) { t.Run("CleanUp", func(t *testing.T) { hcd := newHighlightCodeDiff() - codeA := template.HTML(`this is updated comment`) + codeA := template.HTML(`this is a comment`) + codeB := template.HTML(`this is updated comment`) outDel := hcd.diffLineWithHighlight(DiffLineDel, codeA, codeB) - assert.Equal(t, `a comment`, string(outDel)) + assert.Equal(t, `this is a comment`, string(outDel)) outAdd := hcd.diffLineWithHighlight(DiffLineAdd, codeA, codeB) - assert.Equal(t, `updated comment`, string(outAdd)) + assert.Equal(t, `this is updated comment`, string(outAdd)) }) t.Run("OpenCloseTags", func(t *testing.T) { diff --git a/tests/integration/migration-test/migration_test.go b/tests/integration/migration-test/migration_test.go index 5fa7cbbfb7..2659c5c53d 100644 --- a/tests/integration/migration-test/migration_test.go +++ b/tests/integration/migration-test/migration_test.go @@ -4,6 +4,7 @@ package migrations import ( + "bytes" "compress/gzip" "context" "database/sql" @@ -21,7 +22,6 @@ import ( "code.gitea.io/gitea/models/migrations" migrate_base "code.gitea.io/gitea/models/migrations/base" "code.gitea.io/gitea/models/unittest" - "code.gitea.io/gitea/modules/charset" "code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/setting" @@ -108,11 +108,11 @@ func readSQLFromFile(version string) (string, error) { } defer gr.Close() - bytes, err := io.ReadAll(gr) + buf, err := io.ReadAll(gr) if err != nil { return "", err } - return string(charset.MaybeRemoveBOM(bytes, charset.ConvertOpts{})), nil + return string(bytes.TrimPrefix(buf, []byte{'\xef', '\xbb', '\xbf'})), nil } func restoreOldDB(t *testing.T, version string) {