From 33c3cbc9688aa37bb27101eedf5efe9aa38660d0 Mon Sep 17 00:00:00 2001 From: Lauris BH Date: Sun, 30 Sep 2018 04:02:16 +0300 Subject: [PATCH] Detect charset and convert non UTF-8 files for display (#4950) (#4994) * Detect charset and convert non UTF-8 files for display * Refactor and move function to correct module * Revert unrelated changes * More unrelated changes * Duplicate content for small text to have better encoding detection * Check if original content is valid before duplicating it --- modules/base/tool.go | 17 ++++++++++++++++- modules/templates/helper.go | 25 ++++++++++++++++++++++++- routers/repo/view.go | 6 ++++-- 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/modules/base/tool.go b/modules/base/tool.go index 2dfd8ffec..d5ec9e83f 100644 --- a/modules/base/tool.go +++ b/modules/base/tool.go @@ -59,7 +59,22 @@ func DetectEncoding(content []byte) (string, error) { return "UTF-8", nil } - result, err := chardet.NewTextDetector().DetectBest(content) + textDetector := chardet.NewTextDetector() + var detectContent []byte + if len(content) < 1024 { + // Check if original content is valid + if _, err := textDetector.DetectBest(content); err != nil { + return "", err + } + times := 1024 / len(content) + detectContent = make([]byte, 0, times*len(content)) + for i := 0; i < times; i++ { + detectContent = append(detectContent, content...) + } + } else { + detectContent = content + } + result, err := textDetector.DetectBest(detectContent) if err != nil { return "", err } diff --git a/modules/templates/helper.go b/modules/templates/helper.go index b6c835ad4..ae7d86ed9 100644 --- a/modules/templates/helper.go +++ b/modules/templates/helper.go @@ -1,3 +1,4 @@ +// Copyright 2018 The Gitea Authors. All rights reserved. // Copyright 2014 The Gogs Authors. All rights reserved. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. 
@@ -243,7 +244,7 @@ func ToUTF8WithErr(content []byte) (string, error) { } // If there is an error, we concatenate the nicely decoded part and the - // original left over. This way we won't loose data. + // original left over. This way we won't lose data. result, n, err := transform.String(encoding.NewDecoder(), string(content)) if err != nil { result = result + string(content[n:]) @@ -252,6 +253,28 @@ func ToUTF8WithErr(content []byte) (string, error) { return result, err } +// ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible +func ToUTF8WithFallback(content []byte) []byte { + charsetLabel, err := base.DetectEncoding(content) + if err != nil || charsetLabel == "UTF-8" { + return content + } + + encoding, _ := charset.Lookup(charsetLabel) + if encoding == nil { + return content + } + + // If there is an error, we concatenate the nicely decoded part and the + // original left over. This way we won't lose data. + result, n, err := transform.Bytes(encoding.NewDecoder(), content) + if err != nil { + return append(result, content[n:]...) + } + + return result +} + // ToUTF8 converts content to UTF8 encoding and ignore error func ToUTF8(content string) string { res, _ := ToUTF8WithErr([]byte(content)) diff --git a/routers/repo/view.go b/routers/repo/view.go index 4f1deeae4..6502c7586 100644 --- a/routers/repo/view.go +++ b/routers/repo/view.go @@ -25,6 +25,7 @@ import ( "code.gitea.io/gitea/modules/markup" "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/templates" + "github.com/Unknwon/paginater" ) @@ -99,7 +100,8 @@ func renderDirectory(ctx *context.Context, treeLink string) { ctx.Data["FileSize"] = readmeFile.Size() } else { d, _ := ioutil.ReadAll(dataRc) - buf = append(buf, d...) 
+ buf = templates.ToUTF8WithFallback(append(buf, d...)) + if markup.Type(readmeFile.Name()) != "" { ctx.Data["IsMarkup"] = true ctx.Data["FileContent"] = string(markup.Render(readmeFile.Name(), buf, treeLink, ctx.Repo.Repository.ComposeMetas())) @@ -201,7 +203,7 @@ func renderFile(ctx *context.Context, entry *git.TreeEntry, treeLink, rawLink st } d, _ := ioutil.ReadAll(dataRc) - buf = append(buf, d...) + buf = templates.ToUTF8WithFallback(append(buf, d...)) readmeExist := markup.IsReadmeFile(blob.Name()) ctx.Data["ReadmeExist"] = readmeExist