From 89bfddc5c25e757ebf3c287077f329dde95822f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20J=C3=B6rg?= <14005567+mrtz-j@users.noreply.github.com> Date: Sun, 25 Jan 2026 15:02:16 +0100 Subject: [PATCH] Normalize guessed languages for code highlighting (#36450) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Handles the case where Enry correctly recognizes the language but returns a language name that chroma does not recognize. Resolves https://github.com/go-gitea/gitea/issues/22443 --------- Co-authored-by: Moritz Jörg Co-authored-by: wxiaoguang --- custom/conf/app.example.ini | 5 +- modules/analyze/code_language.go | 5 +- modules/highlight/highlight.go | 186 ++++++++++++++-------------- modules/highlight/highlight_test.go | 37 +++++- modules/indexer/code/search.go | 4 +- modules/markup/orgmode/orgmode.go | 38 ++---- routers/init.go | 2 - routers/web/repo/blame.go | 2 +- routers/web/repo/view_file.go | 6 +- services/gitdiff/gitdiff.go | 4 +- 10 files changed, 148 insertions(+), 141 deletions(-) diff --git a/custom/conf/app.example.ini b/custom/conf/app.example.ini index a1228f8dbf..3eaffde970 100644 --- a/custom/conf/app.example.ini +++ b/custom/conf/app.example.ini @@ -2485,8 +2485,9 @@ LEVEL = Info ;[highlight.mapping] ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; Extension mapping to highlight class -;; e.g. 
.toml=ini +;; Extension mapping to highlight class, for example: +;; .toml = ini +;; .my-js = JavaScript ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/modules/analyze/code_language.go b/modules/analyze/code_language.go index 74e7a06d06..d8589861d3 100644 --- a/modules/analyze/code_language.go +++ b/modules/analyze/code_language.go @@ -4,12 +4,13 @@ package analyze import ( - "path/filepath" + "path" "github.com/go-enry/go-enry/v2" ) // GetCodeLanguage detects code language based on file name and content +// It can be slow when the content is used for detection func GetCodeLanguage(filename string, content []byte) string { if language, ok := enry.GetLanguageByExtension(filename); ok { return language @@ -23,5 +24,5 @@ func GetCodeLanguage(filename string, content []byte) string { return enry.OtherLanguage } - return enry.GetLanguage(filepath.Base(filename), content) + return enry.GetLanguage(path.Base(filename), content) } diff --git a/modules/highlight/highlight.go b/modules/highlight/highlight.go index 2b13e9c4ce..68f523c6ca 100644 --- a/modules/highlight/highlight.go +++ b/modules/highlight/highlight.go @@ -12,7 +12,6 @@ import ( "html/template" "io" "path" - "path/filepath" "strings" "sync" @@ -25,35 +24,32 @@ import ( "github.com/alecthomas/chroma/v2/formatters/html" "github.com/alecthomas/chroma/v2/lexers" "github.com/alecthomas/chroma/v2/styles" - lru "github.com/hashicorp/golang-lru/v2" + "github.com/go-enry/go-enry/v2" ) // don't index files larger than this many bytes for performance purposes const sizeLimit = 1024 * 1024 +type globalVarsType struct { + highlightMapping map[string]string + githubStyles *chroma.Style +} + var ( - // For custom user mapping - highlightMapping = map[string]string{} - - once sync.Once - - cache *lru.TwoQueueCache[string, any] - - githubStyles = styles.Get("github") + globalVarsMu sync.Mutex + globalVarsPtr *globalVarsType ) -// NewContext loads custom 
highlight map from local config -func NewContext() { - once.Do(func() { - highlightMapping = setting.GetHighlightMapping() - - // The size 512 is simply a conservative rule of thumb - c, err := lru.New2Q[string, any](512) - if err != nil { - panic(fmt.Sprintf("failed to initialize LRU cache for highlighter: %s", err)) - } - cache = c - }) +func globalVars() *globalVarsType { + // in the future, the globalVars might need to be re-initialized when settings change, so don't use sync.Once here + globalVarsMu.Lock() + defer globalVarsMu.Unlock() + if globalVarsPtr == nil { + globalVarsPtr = &globalVarsType{} + globalVarsPtr.githubStyles = styles.Get("github") + globalVarsPtr.highlightMapping = setting.GetHighlightMapping() + } + return globalVarsPtr } // UnsafeSplitHighlightedLines splits highlighted code into lines preserving HTML tags @@ -88,10 +84,56 @@ func UnsafeSplitHighlightedLines(code template.HTML) (ret [][]byte) { } } -// Code returns an HTML version of code string with chroma syntax highlighting classes and the matched lexer name -func Code(fileName, language, code string) (output template.HTML, lexerName string) { - NewContext() +func getChromaLexerByLanguage(fileName, lang string) chroma.Lexer { + lang, _, _ = strings.Cut(lang, "?") // maybe, the value from gitattributes might contain `?` parameters? + ext := path.Ext(fileName) + // the "lang" might come from enry, it has different naming for some languages + switch lang { + case "F#": + lang = "FSharp" + case "Pascal": + lang = "ObjectPascal" + case "C": + if ext == ".C" || ext == ".H" { + lang = "C++" + } + } + // lexers.Get is slow if the language name can't be matched directly: it does extra "Match" call to iterate all lexers + return lexers.Get(lang) +} +// GetChromaLexerWithFallback returns a chroma lexer by given file name, language and code content. All parameters can be optional. +// When code content is provided, it will be slow if no lexer is found by file name or language. 
+// If no lexer is found, it will return the fallback lexer. +func GetChromaLexerWithFallback(fileName, lang string, code []byte) (lexer chroma.Lexer) { + if lang != "" { + lexer = getChromaLexerByLanguage(fileName, lang) + } + + if lexer == nil { + fileExt := path.Ext(fileName) + if val, ok := globalVars().highlightMapping[fileExt]; ok { + lexer = getChromaLexerByLanguage(fileName, val) // use mapped value to find lexer + } + } + + if lexer == nil { + // when using "code" to detect, analyze.GetCodeLanguage is slower, it iterates many rules to detect language from content + // this is the old logic: use enry to detect language, and use chroma to render, but their naming is different for some languages + enryLanguage := analyze.GetCodeLanguage(fileName, code) + lexer = getChromaLexerByLanguage(fileName, enryLanguage) + if lexer == nil { + if enryLanguage != enry.OtherLanguage { + log.Warn("No chroma lexer found for enry detected language: %s (file: %s), need to fix the language mapping between enry and chroma.", enryLanguage, fileName) + } + lexer = lexers.Match(fileName) // lexers.Match will search by its basename and extname + } + } + + return util.IfZero(lexer, lexers.Fallback) +} + +func renderCode(fileName, language, code string, slowGuess bool) (output template.HTML, lexerName string) { // diff view newline will be passed as empty, change to literal '\n' so it can be copied // preserve literal newline in blame view if code == "" || code == "\n" { @@ -102,45 +144,25 @@ func Code(fileName, language, code string) (output template.HTML, lexerName stri return template.HTML(template.HTMLEscapeString(code)), "" } - var lexer chroma.Lexer - - if len(language) > 0 { - lexer = lexers.Get(language) - - if lexer == nil { - // Attempt stripping off the '?' 
- if before, _, ok := strings.Cut(language, "?"); ok { - lexer = lexers.Get(before) - } - } + var codeForGuessLexer []byte + if slowGuess { + // it is slower to guess lexer by code content, so only do it when necessary + codeForGuessLexer = util.UnsafeStringToBytes(code) } - - if lexer == nil { - if val, ok := highlightMapping[path.Ext(fileName)]; ok { - // use mapped value to find lexer - lexer = lexers.Get(val) - } - } - - if lexer == nil { - if l, ok := cache.Get(fileName); ok { - lexer = l.(chroma.Lexer) - } - } - - if lexer == nil { - lexer = lexers.Match(fileName) - if lexer == nil { - lexer = lexers.Fallback - } - cache.Add(fileName, lexer) - } - - return CodeFromLexer(lexer, code), formatLexerName(lexer.Config().Name) + lexer := GetChromaLexerWithFallback(fileName, language, codeForGuessLexer) + return RenderCodeByLexer(lexer, code), formatLexerName(lexer.Config().Name) } -// CodeFromLexer returns a HTML version of code string with chroma syntax highlighting classes -func CodeFromLexer(lexer chroma.Lexer, code string) template.HTML { +func RenderCodeFast(fileName, language, code string) (output template.HTML, lexerName string) { + return renderCode(fileName, language, code, false) +} + +func RenderCodeSlowGuess(fileName, language, code string) (output template.HTML, lexerName string) { + return renderCode(fileName, language, code, true) +} + +// RenderCodeByLexer returns a HTML version of code string with chroma syntax highlighting classes +func RenderCodeByLexer(lexer chroma.Lexer, code string) template.HTML { formatter := html.New(html.WithClasses(true), html.WithLineNumbers(false), html.PreventSurroundingPre(true), @@ -155,7 +177,7 @@ func CodeFromLexer(lexer chroma.Lexer, code string) template.HTML { return template.HTML(template.HTMLEscapeString(code)) } // style not used for live site but need to pass something - err = formatter.Format(htmlw, githubStyles, iterator) + err = formatter.Format(htmlw, globalVars().githubStyles, iterator) if err != nil { 
log.Error("Can't format code: %v", err) return template.HTML(template.HTMLEscapeString(code)) @@ -167,12 +189,10 @@ func CodeFromLexer(lexer chroma.Lexer, code string) template.HTML { return template.HTML(strings.TrimSuffix(htmlbuf.String(), "\n")) } -// File returns a slice of chroma syntax highlighted HTML lines of code and the matched lexer name -func File(fileName, language string, code []byte) ([]template.HTML, string, error) { - NewContext() - +// RenderFullFile returns a slice of chroma syntax highlighted HTML lines of code and the matched lexer name +func RenderFullFile(fileName, language string, code []byte) ([]template.HTML, string, error) { if len(code) > sizeLimit { - return PlainText(code), "", nil + return RenderPlainText(code), "", nil } formatter := html.New(html.WithClasses(true), @@ -180,31 +200,7 @@ func File(fileName, language string, code []byte) ([]template.HTML, string, erro html.PreventSurroundingPre(true), ) - var lexer chroma.Lexer - - // provided language overrides everything - if language != "" { - lexer = lexers.Get(language) - } - - if lexer == nil { - if val, ok := highlightMapping[filepath.Ext(fileName)]; ok { - lexer = lexers.Get(val) - } - } - - if lexer == nil { - guessLanguage := analyze.GetCodeLanguage(fileName, code) - - lexer = lexers.Get(guessLanguage) - if lexer == nil { - lexer = lexers.Match(fileName) - if lexer == nil { - lexer = lexers.Fallback - } - } - } - + lexer := GetChromaLexerWithFallback(fileName, language, code) lexerName := formatLexerName(lexer.Config().Name) iterator, err := lexer.Tokenise(nil, string(code)) @@ -218,7 +214,7 @@ func File(fileName, language string, code []byte) ([]template.HTML, string, erro lines := make([]template.HTML, 0, len(tokensLines)) for _, tokens := range tokensLines { iterator = chroma.Literator(tokens...) 
- err = formatter.Format(htmlBuf, githubStyles, iterator) + err = formatter.Format(htmlBuf, globalVars().githubStyles, iterator) if err != nil { return nil, "", fmt.Errorf("can't format code: %w", err) } @@ -229,8 +225,8 @@ func File(fileName, language string, code []byte) ([]template.HTML, string, erro return lines, lexerName, nil } -// PlainText returns non-highlighted HTML for code -func PlainText(code []byte) []template.HTML { +// RenderPlainText returns non-highlighted HTML for code +func RenderPlainText(code []byte) []template.HTML { r := bufio.NewReader(bytes.NewReader(code)) m := make([]template.HTML, 0, bytes.Count(code, []byte{'\n'})+1) for { diff --git a/modules/highlight/highlight_test.go b/modules/highlight/highlight_test.go index 52873427a8..f4bdedb2a0 100644 --- a/modules/highlight/highlight_test.go +++ b/modules/highlight/highlight_test.go @@ -112,7 +112,7 @@ c=2 for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - out, lexerName, err := File(tt.name, "", []byte(tt.code)) + out, lexerName, err := RenderFullFile(tt.name, "", []byte(tt.code)) assert.NoError(t, err) assert.Equal(t, tt.want, out) assert.Equal(t, tt.lexerName, lexerName) @@ -176,7 +176,7 @@ c=2`), for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - out := PlainText([]byte(tt.code)) + out := RenderPlainText([]byte(tt.code)) assert.Equal(t, tt.want, out) }) } @@ -199,3 +199,36 @@ func TestUnsafeSplitHighlightedLines(t *testing.T) { assert.Equal(t, "a\n", string(ret[0])) assert.Equal(t, "b\n", string(ret[1])) } + +func TestGetChromaLexer(t *testing.T) { + globalVars().highlightMapping[".my-html"] = "HTML" + t.Cleanup(func() { delete(globalVars().highlightMapping, ".my-html") }) + + cases := []struct { + fileName string + language string + content string + expected string + }{ + {"test.py", "", "", "Python"}, + + {"any-file", "javascript", "", "JavaScript"}, + {"any-file", "", "/* vim: set filetype=python */", "Python"}, + {"any-file", "", "", "fallback"}, + + 
{"test.fs", "", "", "Forth"}, + {"test.fs", "F#", "", "FSharp"}, + {"test.fs", "", "let x = 1", "FSharp"}, + + {"test.c", "", "", "C"}, + {"test.C", "", "", "C++"}, + {"OLD-CODE.PAS", "", "", "ObjectPascal"}, + {"test.my-html", "", "", "HTML"}, + } + for _, c := range cases { + lexer := GetChromaLexerWithFallback(c.fileName, c.language, []byte(c.content)) + if assert.NotNil(t, lexer, "case: %+v", c) { + assert.Equal(t, c.expected, lexer.Config().Name, "case: %+v", c) + } + } +} diff --git a/modules/indexer/code/search.go b/modules/indexer/code/search.go index a7a5d7d2e3..907dd1a537 100644 --- a/modules/indexer/code/search.go +++ b/modules/indexer/code/search.go @@ -72,10 +72,10 @@ func writeStrings(buf *bytes.Buffer, strs ...string) error { func HighlightSearchResultCode(filename, language string, lineNums []int, code string) []*ResultLine { // we should highlight the whole code block first, otherwise it doesn't work well with multiple line highlighting - hl, _ := highlight.Code(filename, language, code) + hl, _ := highlight.RenderCodeFast(filename, language, code) highlightedLines := strings.Split(string(hl), "\n") - // The lineNums outputted by highlight.Code might not match the original lineNums, because "highlight" removes the last `\n` + // The lineNums outputted by render might not match the original lineNums, because "highlight" removes the last `\n` lines := make([]*ResultLine, min(len(highlightedLines), len(lineNums))) for i := range lines { lines[i] = &ResultLine{ diff --git a/modules/markup/orgmode/orgmode.go b/modules/markup/orgmode/orgmode.go index 93c335d244..ff2b7600eb 100644 --- a/modules/markup/orgmode/orgmode.go +++ b/modules/markup/orgmode/orgmode.go @@ -5,7 +5,6 @@ package orgmode import ( "fmt" - "html" "html/template" "io" "strings" @@ -17,7 +16,6 @@ import ( "code.gitea.io/gitea/modules/setting" "github.com/alecthomas/chroma/v2" - "github.com/alecthomas/chroma/v2/lexers" "github.com/niklasfasching/go-org/org" ) @@ -57,40 +55,20 @@ func 
Render(ctx *markup.RenderContext, input io.Reader, output io.Writer) error htmlWriter.HighlightCodeBlock = func(source, lang string, inline bool, params map[string]string) string { defer func() { if err := recover(); err != nil { + // catch the panic, log the error and return empty result log.Error("Panic in HighlightCodeBlock: %v\n%s", err, log.Stack(2)) - panic(err) } }() - w := &strings.Builder{} - lexer := lexers.Get(lang) - if lexer == nil && lang == "" { - lexer = lexers.Analyse(source) - if lexer == nil { - lexer = lexers.Fallback - } - lang = strings.ToLower(lexer.Config().Name) - } + lexer := highlight.GetChromaLexerWithFallback("", lang, nil) // don't use content to detect, it is too slow + lexer = chroma.Coalesce(lexer) + sb := &strings.Builder{} // include language-x class as part of commonmark spec - if err := ctx.RenderInternal.FormatWithSafeAttrs(w, `
`, lang); err != nil {
-			return ""
-		}
-		if lexer == nil {
-			if _, err := w.WriteString(html.EscapeString(source)); err != nil {
-				return ""
-			}
-		} else {
-			lexer = chroma.Coalesce(lexer)
-			if _, err := w.WriteString(string(highlight.CodeFromLexer(lexer, source))); err != nil {
-				return ""
-			}
-		}
-		if _, err := w.WriteString("
"); err != nil { - return "" - } - - return w.String() + _ = ctx.RenderInternal.FormatWithSafeAttrs(sb, `
`, strings.ToLower(lexer.Config().Name))
+		_, _ = sb.WriteString(string(highlight.RenderCodeByLexer(lexer, source)))
+		_, _ = sb.WriteString("
") + return sb.String() } w := &orgWriter{rctx: ctx, HTMLWriter: htmlWriter} diff --git a/routers/init.go b/routers/init.go index 3af5f9f510..82a5378263 100644 --- a/routers/init.go +++ b/routers/init.go @@ -15,7 +15,6 @@ import ( "code.gitea.io/gitea/modules/eventsource" "code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/git/gitcmd" - "code.gitea.io/gitea/modules/highlight" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/markup" "code.gitea.io/gitea/modules/markup/external" @@ -131,7 +130,6 @@ func InitWebInstalled(ctx context.Context) { mustInit(uinotification.Init) mustInitCtx(ctx, archiver.Init) - highlight.NewContext() external.RegisterRenderers() markup.Init(markup_service.FormalRenderHelperFuncs()) diff --git a/routers/web/repo/blame.go b/routers/web/repo/blame.go index ab3aecbbe7..25eb88eefc 100644 --- a/routers/web/repo/blame.go +++ b/routers/web/repo/blame.go @@ -267,7 +267,7 @@ func renderBlame(ctx *context.Context, blameParts []*gitrepo.BlamePart, commitNa bufContent := buf.Bytes() bufContent = charset.ToUTF8(bufContent, charset.ConvertOpts{}) - highlighted, lexerName := highlight.Code(path.Base(ctx.Repo.TreePath), language, util.UnsafeBytesToString(bufContent)) + highlighted, lexerName := highlight.RenderCodeSlowGuess(path.Base(ctx.Repo.TreePath), language, util.UnsafeBytesToString(bufContent)) unsafeLines := highlight.UnsafeSplitHighlightedLines(highlighted) for i, br := range rows { var line template.HTML diff --git a/routers/web/repo/view_file.go b/routers/web/repo/view_file.go index 167cd5f927..aca5df944d 100644 --- a/routers/web/repo/view_file.go +++ b/routers/web/repo/view_file.go @@ -124,11 +124,11 @@ func handleFileViewRenderSource(ctx *context.Context, filename string, attrs *at } language := attrs.GetLanguage().Value() - fileContent, lexerName, err := highlight.File(filename, language, buf) + fileContent, lexerName, err := highlight.RenderFullFile(filename, language, buf) ctx.Data["LexerName"] = lexerName if err != 
nil { - log.Error("highlight.File failed, fallback to plain text: %v", err) - fileContent = highlight.PlainText(buf) + log.Error("highlight.RenderFullFile failed, fallback to plain text: %v", err) + fileContent = highlight.RenderPlainText(buf) } status := &charset.EscapeStatus{} statuses := make([]*charset.EscapeStatus, len(fileContent)) diff --git a/services/gitdiff/gitdiff.go b/services/gitdiff/gitdiff.go index 06c19b90d9..3728f50d21 100644 --- a/services/gitdiff/gitdiff.go +++ b/services/gitdiff/gitdiff.go @@ -331,7 +331,7 @@ func (diffSection *DiffSection) getLineContentForRender(lineIdx int, diffLine *D if setting.Git.DisableDiffHighlight { return template.HTML(html.EscapeString(diffLine.Content[1:])) } - h, _ = highlight.Code(diffSection.FileName, fileLanguage, diffLine.Content[1:]) + h, _ = highlight.RenderCodeFast(diffSection.FileName, fileLanguage, diffLine.Content[1:]) return h } @@ -1349,7 +1349,7 @@ func GetDiffForRender(ctx context.Context, repoLink string, gitRepo *git.Reposit func highlightCodeLines(diffFile *DiffFile, isLeft bool, rawContent []byte) map[int]template.HTML { content := util.UnsafeBytesToString(charset.ToUTF8(rawContent, charset.ConvertOpts{})) - highlightedNewContent, _ := highlight.Code(diffFile.Name, diffFile.Language, content) + highlightedNewContent, _ := highlight.RenderCodeFast(diffFile.Name, diffFile.Language, content) unsafeLines := highlight.UnsafeSplitHighlightedLines(highlightedNewContent) lines := make(map[int]template.HTML, len(unsafeLines)) // only save the highlighted lines we need, but not the whole file, to save memory