// Copyright 2014 The Gogs Authors. All rights reserved. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. package markdown import ( "bytes" "fmt" "io" "net/url" "path" "path/filepath" "regexp" "strings" "github.com/Unknwon/com" "github.com/microcosm-cc/bluemonday" "github.com/russross/blackfriday" "golang.org/x/net/html" "code.gitea.io/gitea/modules/base" "code.gitea.io/gitea/modules/setting" ) // Issue name styles const ( IssueNameStyleNumeric = "numeric" IssueNameStyleAlphanumeric = "alphanumeric" ) // Sanitizer markdown sanitizer var Sanitizer = bluemonday.UGCPolicy() // BuildSanitizer initializes sanitizer with allowed attributes based on settings. // This function should only be called once during entire application lifecycle. func BuildSanitizer() { // Normal markdown-stuff Sanitizer.AllowAttrs("class").Matching(regexp.MustCompile(`[\p{L}\p{N}\s\-_',:\[\]!\./\\\(\)&]*`)).OnElements("code", "div", "ul", "ol", "dl") // Checkboxes Sanitizer.AllowAttrs("type").Matching(regexp.MustCompile(`^checkbox$`)).OnElements("input") Sanitizer.AllowAttrs("checked", "disabled").OnElements("input") Sanitizer.AllowNoAttrs().OnElements("label") // Custom URL-Schemes Sanitizer.AllowURLSchemes(setting.Markdown.CustomURLSchemes...) } // IsMarkdownFile reports whether name looks like a Markdown file // based on its extension. func IsMarkdownFile(name string) bool { extension := strings.ToLower(filepath.Ext(name)) for _, ext := range setting.Markdown.FileExtensions { if strings.ToLower(ext) == extension { return true } } return false } // IsReadmeFile reports whether name looks like a README file // based on its name. func IsReadmeFile(name string) bool { name = strings.ToLower(name) if len(name) < 6 { return false } else if len(name) == 6 { return name == "readme" } return name[:7] == "readme." } var ( // MentionPattern matches string that mentions someone, e.g. @Unknwon MentionPattern = regexp.MustCompile(`(\s|^|\W)@[0-9a-zA-Z-_\.]+`) // IssueNumericPattern matches string that references to a numeric issue, e.g. #1287 IssueNumericPattern = regexp.MustCompile(`( |^|\()#[0-9]+\b`) // IssueAlphanumericPattern matches string that references to an alphanumeric issue, e.g. ABC-1234 IssueAlphanumericPattern = regexp.MustCompile(`( |^|\()[A-Z]{1,10}-[1-9][0-9]*\b`) // CrossReferenceIssueNumericPattern matches string that references a numeric issue in a different repository // e.g. gogits/gogs#12345 CrossReferenceIssueNumericPattern = regexp.MustCompile(`( |^)[0-9a-zA-Z]+/[0-9a-zA-Z]+#[0-9]+\b`) // Sha1CurrentPattern matches string that represents a commit SHA, e.g. d8a994ef243349f321568f9e36d5c3f444b99cae // FIXME: this pattern matches pure numbers as well, right now we do a hack to check in renderSha1CurrentPattern // by converting string to a number. Sha1CurrentPattern = regexp.MustCompile(`(?:^|\s|\()[0-9a-f]{40}\b`) // ShortLinkPattern matches short but difficult to parse [[name|link|arg=test]] syntax ShortLinkPattern = regexp.MustCompile(`(\[\[.*\]\]\w*)`) // AnySHA1Pattern allows to split url containing SHA into parts AnySHA1Pattern = regexp.MustCompile(`http\S+//(\S+)/(\S+)/(\S+)/(\S+)/([0-9a-f]{40})(?:/?([^#\s]+)?(?:#(\S+))?)?`) // IssueFullPattern allows to split issue (and pull) URLs into parts IssueFullPattern = regexp.MustCompile(`(?:^|\s|\()http\S+//((?:[^\s/]+/)+)((?:\w{1,10}-)?[1-9][0-9]*)([\?|#]\S+.(\S+)?)?\b`) validLinksPattern = regexp.MustCompile(`^[a-z][\w-]+://`) ) // isLink reports whether link fits valid format. func isLink(link []byte) bool { return validLinksPattern.Match(link) } // FindAllMentions matches mention patterns in given content // and returns a list of found user names without @ prefix. func FindAllMentions(content string) []string { mentions := MentionPattern.FindAllString(content, -1) for i := range mentions { mentions[i] = mentions[i][strings.Index(mentions[i], "@")+1:] // Strip @ character } return mentions } // Renderer is a extended version of underlying render object. type Renderer struct { blackfriday.Renderer urlPrefix string isWikiMarkdown bool } // Link defines how formal links should be processed to produce corresponding HTML elements. func (r *Renderer) Link(out *bytes.Buffer, link []byte, title []byte, content []byte) { if len(link) > 0 && !isLink(link) { if link[0] != '#' { mLink := URLJoin(r.urlPrefix, string(link)) if r.isWikiMarkdown { mLink = URLJoin(r.urlPrefix, "wiki", string(link)) } link = []byte(mLink) } } r.Renderer.Link(out, link, title, content) } // List renders markdown bullet or digit lists to HTML func (r *Renderer) List(out *bytes.Buffer, text func() bool, flags int) { marker := out.Len() if out.Len() > 0 { out.WriteByte('\n') } if flags&blackfriday.LIST_TYPE_DEFINITION != 0 { out.WriteString("
")) { prefix = "
" } switch { case bytes.HasPrefix(text, []byte(prefix+"[ ] ")): text = append([]byte(`
`), text[3+len(prefix):]...) case bytes.HasPrefix(text, []byte(prefix+"[x] ")): text = append([]byte(``), text[3+len(prefix):]...) } if prefix != "" { text = bytes.Replace(text, []byte(""), []byte{}, 1) } r.Renderer.ListItem(out, text, flags) } // Note: this section is for purpose of increase performance and // reduce memory allocation at runtime since they are constant literals. var ( svgSuffix = []byte(".svg") svgSuffixWithMark = []byte(".svg?") ) // Image defines how images should be processed to produce corresponding HTML elements. func (r *Renderer) Image(out *bytes.Buffer, link []byte, title []byte, alt []byte) { prefix := r.urlPrefix if r.isWikiMarkdown { prefix = URLJoin(prefix, "wiki", "src") } prefix = strings.Replace(prefix, "/src/", "/raw/", 1) if len(link) > 0 { if isLink(link) { // External link with .svg suffix usually means CI status. // TODO: define a keyword to allow non-svg images render as external link. if bytes.HasSuffix(link, svgSuffix) || bytes.Contains(link, svgSuffixWithMark) { r.Renderer.Image(out, link, title, alt) return } } else { if link[0] != '/' { if !strings.HasSuffix(prefix, "/") { prefix += "/" } } link = []byte(url.QueryEscape(prefix + string(link))) } } out.WriteString(``) r.Renderer.Image(out, link, title, alt) out.WriteString("") } // cutoutVerbosePrefix cutouts URL prefix including sub-path to // return a clean unified string of request URL path. func cutoutVerbosePrefix(prefix string) string { if len(prefix) == 0 || prefix[0] != '/' { return prefix } count := 0 for i := 0; i < len(prefix); i++ { if prefix[i] == '/' { count++ } if count >= 3+setting.AppSubURLDepth { return prefix[:i] } } return prefix } // URLJoin joins url components, like path.Join, but preserving contents func URLJoin(elem ...string) string { res := "" last := len(elem) - 1 for i, item := range elem { res += item if !strings.HasSuffix(res, "/") && i != last { res += "/" } } return res } // RenderIssueIndexPattern renders issue indexes to corresponding links. func RenderIssueIndexPattern(rawBytes []byte, urlPrefix string, metas map[string]string) []byte { urlPrefix = cutoutVerbosePrefix(urlPrefix) pattern := IssueNumericPattern if metas["style"] == IssueNameStyleAlphanumeric { pattern = IssueAlphanumericPattern } ms := pattern.FindAll(rawBytes, -1) for _, m := range ms { if m[0] == ' ' || m[0] == '(' { m = m[1:] // ignore leading space or opening parentheses } var link string if metas == nil { link = fmt.Sprintf(`%s`, URLJoin(urlPrefix, "issues", string(m[1:])), m) } else { // Support for external issue tracker if metas["style"] == IssueNameStyleAlphanumeric { metas["index"] = string(m) } else { metas["index"] = string(m[1:]) } link = fmt.Sprintf(`%s`, com.Expand(metas["format"], metas), m) } rawBytes = bytes.Replace(rawBytes, m, []byte(link), 1) } return rawBytes } // IsSameDomain checks if given url string has the same hostname as current Gitea instance func IsSameDomain(s string) bool { if uapp, err := url.Parse(setting.AppURL); err == nil { if u, err := url.Parse(s); err == nil { return u.Host == uapp.Host } return false } return false } // renderFullSha1Pattern renders SHA containing URLs func renderFullSha1Pattern(rawBytes []byte, urlPrefix string) []byte { ms := AnySHA1Pattern.FindAllSubmatch(rawBytes, -1) for _, m := range ms { all := m[0] paths := string(m[1]) var path = "//" + paths author := string(m[2]) repoName := string(m[3]) path = URLJoin(path, author, repoName) ltype := "src" itemType := m[4] if IsSameDomain(paths) { ltype = string(itemType) } else if string(itemType) == "commit" { ltype = "commit" } sha := m[5] var subtree string if len(m) > 6 && len(m[6]) > 0 { subtree = string(m[6]) } var line []byte if len(m) > 7 && len(m[7]) > 0 { line = m[7] } urlSuffix := "" text := base.ShortSha(string(sha)) if subtree != "" { urlSuffix = "/" + subtree text += urlSuffix } if line != nil { value := string(line) urlSuffix += "#" urlSuffix += value text += " (" text += value text += ")" } rawBytes = bytes.Replace(rawBytes, all, []byte(fmt.Sprintf( `%s`, URLJoin(path, ltype, string(sha))+urlSuffix, text)), -1) } return rawBytes } // renderFullIssuePattern renders issues-like URLs func renderFullIssuePattern(rawBytes []byte, urlPrefix string) []byte { ms := IssueFullPattern.FindAllSubmatch(rawBytes, -1) for _, m := range ms { all := m[0] paths := bytes.Split(m[1], []byte("/")) paths = paths[:len(paths)-1] if bytes.HasPrefix(paths[0], []byte("gist.")) { continue } var path string if len(paths) > 3 { // Internal one path = URLJoin(urlPrefix, "issues") } else { path = "//" + string(m[1]) } id := string(m[2]) path = URLJoin(path, id) var comment []byte if len(m) > 3 { comment = m[3] } urlSuffix := "" text := "#" + id if comment != nil { urlSuffix += string(comment) text += " " } rawBytes = bytes.Replace(rawBytes, all, []byte(fmt.Sprintf( `%s`, path, urlSuffix, text)), -1) } return rawBytes } func firstIndexOfByte(sl []byte, target byte) int { for i := 0; i < len(sl); i++ { if sl[i] == target { return i } } return -1 } func lastIndexOfByte(sl []byte, target byte) int { for i := len(sl) - 1; i >= 0; i-- { if sl[i] == target { return i } } return -1 } // renderShortLinks processes [[syntax]] func renderShortLinks(rawBytes []byte, urlPrefix string, noLink bool) []byte { ms := ShortLinkPattern.FindAll(rawBytes, -1) for _, m := range ms { orig := bytes.TrimSpace(m) m = orig[2:] tailPos := lastIndexOfByte(m, ']') + 1 tail := []byte{} if tailPos < len(m) { tail = m[tailPos:] m = m[:tailPos-1] } m = m[:len(m)-2] props := map[string]string{} // MediaWiki uses [[link|text]], while GitHub uses [[text|link]] // It makes page handling terrible, but we prefer GitHub syntax // And fall back to MediaWiki only when it is obvious from the look // Of text and link contents sl := bytes.Split(m, []byte("|")) for _, v := range sl { switch bytes.Count(v, []byte("=")) { // Piped args without = sign, these are mandatory arguments case 0: { sv := string(v) if props["name"] == "" { if isLink(v) { // If we clearly see it is a link, we save it so // But first we need to ensure, that if both mandatory args provided // look like links, we stick to GitHub syntax if props["link"] != "" { props["name"] = props["link"] } props["link"] = strings.TrimSpace(sv) } else { props["name"] = sv } } else { props["link"] = strings.TrimSpace(sv) } } // Piped args with = sign, these are optional arguments case 1: { sep := firstIndexOfByte(v, '=') key, val := string(v[:sep]), html.UnescapeString(string(v[sep+1:])) lastCharIndex := len(val) - 1 if (val[0] == '"' || val[0] == '\'') && (val[lastCharIndex] == '"' || val[lastCharIndex] == '\'') { val = val[1:lastCharIndex] } props[key] = val } } } var name string var link string if props["link"] != "" { link = props["link"] } else if props["name"] != "" { link = props["name"] } if props["title"] != "" { name = props["title"] } else if props["name"] != "" { name = props["name"] } else { name = link } name += string(tail) image := false ext := filepath.Ext(string(link)) if ext != "" { switch ext { case ".jpg", ".jpeg", ".png", ".tif", ".tiff", ".webp", ".gif", ".bmp", ".ico", ".svg": { image = true } } } absoluteLink := isLink([]byte(link)) if !absoluteLink { link = url.QueryEscape(link) } if image { if !absoluteLink { link = URLJoin(urlPrefix, "wiki", "raw", link) } title := props["title"] if title == "" { title = props["alt"] } if title == "" { title = path.Base(string(name)) } alt := props["alt"] if alt == "" { alt = name } if alt != "" { alt = `alt="` + alt + `"` } name = fmt.Sprintf(``, link, alt, title) } else if !absoluteLink { link = URLJoin(urlPrefix, "wiki", link) } if noLink { rawBytes = bytes.Replace(rawBytes, orig, []byte(name), -1) } else { rawBytes = bytes.Replace(rawBytes, orig, []byte(fmt.Sprintf(`%s`, link, name)), -1) } } return rawBytes } // RenderCrossReferenceIssueIndexPattern renders issue indexes from other repositories to corresponding links. func RenderCrossReferenceIssueIndexPattern(rawBytes []byte, urlPrefix string, metas map[string]string) []byte { ms := CrossReferenceIssueNumericPattern.FindAll(rawBytes, -1) for _, m := range ms { if m[0] == ' ' || m[0] == '(' { m = m[1:] // ignore leading space or opening parentheses } repo := string(bytes.Split(m, []byte("#"))[0]) issue := string(bytes.Split(m, []byte("#"))[1]) link := fmt.Sprintf(`%s`, URLJoin(urlPrefix, repo, "issues", issue), m) rawBytes = bytes.Replace(rawBytes, m, []byte(link), 1) } return rawBytes } // renderSha1CurrentPattern renders SHA1 strings to corresponding links that assumes in the same repository. func renderSha1CurrentPattern(rawBytes []byte, urlPrefix string) []byte { ms := Sha1CurrentPattern.FindAllSubmatch(rawBytes, -1) for _, m := range ms { all := m[0] if com.StrTo(all).MustInt() > 0 { continue } rawBytes = bytes.Replace(rawBytes, all, []byte(fmt.Sprintf( `%s`, URLJoin(urlPrefix, "commit", string(all)), base.ShortSha(string(all)))), -1) } return rawBytes } // RenderSpecialLink renders mentions, indexes and SHA1 strings to corresponding links. func RenderSpecialLink(rawBytes []byte, urlPrefix string, metas map[string]string) []byte { ms := MentionPattern.FindAll(rawBytes, -1) for _, m := range ms { m = m[bytes.Index(m, []byte("@")):] rawBytes = bytes.Replace(rawBytes, m, []byte(fmt.Sprintf(`%s`, URLJoin(setting.AppURL, string(m[1:])), m)), -1) } rawBytes = renderShortLinks(rawBytes, urlPrefix, false) rawBytes = RenderIssueIndexPattern(rawBytes, urlPrefix, metas) rawBytes = RenderCrossReferenceIssueIndexPattern(rawBytes, urlPrefix, metas) rawBytes = renderFullSha1Pattern(rawBytes, urlPrefix) rawBytes = renderSha1CurrentPattern(rawBytes, urlPrefix) rawBytes = renderFullIssuePattern(rawBytes, urlPrefix) return rawBytes } // RenderRaw renders Markdown to HTML without handling special links. func RenderRaw(body []byte, urlPrefix string, wikiMarkdown bool) []byte { htmlFlags := 0 htmlFlags |= blackfriday.HTML_SKIP_STYLE htmlFlags |= blackfriday.HTML_OMIT_CONTENTS renderer := &Renderer{ Renderer: blackfriday.HtmlRenderer(htmlFlags, "", ""), urlPrefix: urlPrefix, isWikiMarkdown: wikiMarkdown, } // set up the parser extensions := 0 extensions |= blackfriday.EXTENSION_NO_INTRA_EMPHASIS extensions |= blackfriday.EXTENSION_TABLES extensions |= blackfriday.EXTENSION_FENCED_CODE extensions |= blackfriday.EXTENSION_STRIKETHROUGH extensions |= blackfriday.EXTENSION_NO_EMPTY_LINE_BEFORE_BLOCK if setting.Markdown.EnableHardLineBreak { extensions |= blackfriday.EXTENSION_HARD_LINE_BREAK } body = blackfriday.Markdown(body, renderer, extensions) return body } var ( leftAngleBracket = []byte("") rightAngleBracket = []byte(">") ) var noEndTags = []string{"img", "input", "br", "hr"} // PostProcess treats different types of HTML differently, // and only renders special links for plain text blocks. func PostProcess(rawHTML []byte, urlPrefix string, metas map[string]string) []byte { startTags := make([]string, 0, 5) var buf bytes.Buffer tokenizer := html.NewTokenizer(bytes.NewReader(rawHTML)) OUTER_LOOP: for html.ErrorToken != tokenizer.Next() { token := tokenizer.Token() switch token.Type { case html.TextToken: buf.Write(RenderSpecialLink([]byte(token.String()), urlPrefix, metas)) case html.StartTagToken: buf.WriteString(token.String()) tagName := token.Data // If this is an excluded tag, we skip processing all output until a close tag is encountered. if strings.EqualFold("a", tagName) || strings.EqualFold("code", tagName) || strings.EqualFold("pre", tagName) { stackNum := 1 for html.ErrorToken != tokenizer.Next() { token = tokenizer.Token() // Copy the token to the output verbatim buf.Write(renderShortLinks([]byte(token.String()), urlPrefix, true)) if token.Type == html.StartTagToken { if !com.IsSliceContainsStr(noEndTags, token.Data) { stackNum++ } } // If this is the close tag to the outer-most, we are done if token.Type == html.EndTagToken { stackNum-- if stackNum <= 0 && strings.EqualFold(tagName, token.Data) { break } } } continue OUTER_LOOP } if !com.IsSliceContainsStr(noEndTags, token.Data) { startTags = append(startTags, token.Data) } case html.EndTagToken: if len(startTags) == 0 { buf.WriteString(token.String()) break } buf.Write(leftAngleBracket) buf.WriteString(startTags[len(startTags)-1]) buf.Write(rightAngleBracket) startTags = startTags[:len(startTags)-1] default: buf.WriteString(token.String()) } } if io.EOF == tokenizer.Err() { return buf.Bytes() } // If we are not at the end of the input, then some other parsing error has occurred, // so return the input verbatim. return rawHTML } // Render renders Markdown to HTML with all specific handling stuff. func render(rawBytes []byte, urlPrefix string, metas map[string]string, isWikiMarkdown bool) []byte { urlPrefix = strings.Replace(urlPrefix, " ", "%20", -1) result := RenderRaw(rawBytes, urlPrefix, isWikiMarkdown) result = PostProcess(result, urlPrefix, metas) result = Sanitizer.SanitizeBytes(result) return result } // Render renders Markdown to HTML with all specific handling stuff. func Render(rawBytes []byte, urlPrefix string, metas map[string]string) []byte { return render(rawBytes, urlPrefix, metas, false) } // RenderString renders Markdown to HTML with special links and returns string type. func RenderString(raw, urlPrefix string, metas map[string]string) string { return string(render([]byte(raw), urlPrefix, metas, false)) } // RenderWiki renders markdown wiki page to HTML and return HTML string func RenderWiki(rawBytes []byte, urlPrefix string, metas map[string]string) string { return string(render(rawBytes, urlPrefix, metas, true)) }