podgrab/internal/sanitize/sanitize.go

// Package sanitize provides functions for sanitizing text.
package sanitize

import (
	"bytes"
	"html"
	"html/template"
	"io"
	"path"
	"regexp"
	"strings"

	parser "golang.org/x/net/html"
)

var (
	ignoreTags = []string{"title", "script", "style", "iframe", "frame", "frameset", "noframes", "noembed", "embed", "applet", "object", "base"}

	defaultTags = []string{"h1", "h2", "h3", "h4", "h5", "h6", "div", "span", "hr", "p", "br", "b", "i", "strong", "em", "ol", "ul", "li", "a", "img", "pre", "code", "blockquote", "article", "section"}

	defaultAttributes = []string{"id", "class", "src", "href", "title", "alt", "name", "rel"}
)

// HTMLAllowing sanitizes html, allowing some tags.
// Arrays of allowed tags and allowed attributes may optionally be passed as the second and third arguments.
func HTMLAllowing(s string, args ...[]string) (string, error) {

	allowedTags := defaultTags
	if len(args) > 0 {
		allowedTags = args[0]
	}
	allowedAttributes := defaultAttributes
	if len(args) > 1 {
		allowedAttributes = args[1]
	}

	// Parse the html
	tokenizer := parser.NewTokenizer(strings.NewReader(s))

	buffer := bytes.NewBufferString("")
	ignore := ""

	for {
		tokenType := tokenizer.Next()
		token := tokenizer.Token()

		switch tokenType {

		case parser.ErrorToken:
			err := tokenizer.Err()
			if err == io.EOF {
				return buffer.String(), nil
			}
			return "", err

		case parser.StartTagToken:

			if len(ignore) == 0 && includes(allowedTags, token.Data) {
				token.Attr = cleanAttributes(token.Attr, allowedAttributes)
				buffer.WriteString(token.String())
			} else if includes(ignoreTags, token.Data) {
				ignore = token.Data
			}

		case parser.SelfClosingTagToken:

			if len(ignore) == 0 && includes(allowedTags, token.Data) {
				token.Attr = cleanAttributes(token.Attr, allowedAttributes)
				buffer.WriteString(token.String())
			} else if token.Data == ignore {
				ignore = ""
			}

		case parser.EndTagToken:
			if len(ignore) == 0 && includes(allowedTags, token.Data) {
				token.Attr = []parser.Attribute{}
				buffer.WriteString(token.String())
			} else if token.Data == ignore {
				ignore = ""
			}

		case parser.TextToken:
			// We allow text content through, unless ignoring this entire tag and its contents (including other tags)
			if ignore == "" {
				buffer.WriteString(token.String())
			}
		case parser.CommentToken:
			// We ignore comments by default
		case parser.DoctypeToken:
			// We ignore doctypes by default - html5 does not require them and this is intended for sanitizing snippets of text
		default:
			// We ignore unknown token types by default

		}

	}

}

// HTML strips html tags, replace common entities, and escapes <>&;'" in the result.
// Note the returned text may contain entities as it is escaped by HTMLEscapeString, and most entities are not translated.
func HTML(s string) (output string) {

	// Shortcut strings with no tags in them
	if !strings.ContainsAny(s, "<>") {
		output = s
	} else {

		// First remove line breaks etc as these have no meaning outside html tags (except pre)
		// this means pre sections will lose formatting... but will result in less unintentional paras.
		s = strings.Replace(s, "\n", "", -1)

		// Then replace line breaks with newlines, to preserve that formatting
		s = strings.Replace(s, "</p>", "\n", -1)
		s = strings.Replace(s, "<br>", "\n", -1)
		s = strings.Replace(s, "</br>", "\n", -1)
		s = strings.Replace(s, "<br/>", "\n", -1)
		s = strings.Replace(s, "<br />", "\n", -1)

		// Walk through the string removing all tags
		b := bytes.NewBufferString("")
		inTag := false
		for _, r := range s {
			switch r {
			case '<':
				inTag = true
			case '>':
				inTag = false
			default:
				if !inTag {
					b.WriteRune(r)
				}
			}
		}
		output = b.String()
	}

	// Remove a few common harmless entities, to arrive at something more like plain text
	output = strings.Replace(output, "&#8216;", "'", -1)
	output = strings.Replace(output, "&#8217;", "'", -1)
	output = strings.Replace(output, "&#8220;", "\"", -1)
	output = strings.Replace(output, "&#8221;", "\"", -1)
	output = strings.Replace(output, "&nbsp;", " ", -1)
	output = strings.Replace(output, "&quot;", "\"", -1)
	output = strings.Replace(output, "&apos;", "'", -1)

	// Translate some entities into their plain text equivalent (for example accents, if encoded as entities)
	output = html.UnescapeString(output)

	// In case we have missed any tags above, escape the text - removes <, >, &, ' and ".
	output = template.HTMLEscapeString(output)

	// After processing, remove some harmless entities &, ' and " which are encoded by HTMLEscapeString
	output = strings.Replace(output, "&#34;", "\"", -1)
	output = strings.Replace(output, "&#39;", "'", -1)
	output = strings.Replace(output, "&amp; ", "& ", -1)     // NB space after
	output = strings.Replace(output, "&amp;amp; ", "& ", -1) // NB space after

	return output
}

// We are very restrictive as this is intended for ascii url slugs
var illegalPath = regexp.MustCompile(`[^[:alnum:]\~\-\./]`)

// Path makes a string safe to use as a URL path,
// removing accents and replacing separators with -.
// The path may still start at / and is not intended
// for use as a file system path without prefix.
func Path(s string) string {
	// Start with lowercase string
	filePath := strings.ToLower(s)
	filePath = strings.Replace(filePath, "..", "", -1)
	filePath = path.Clean(filePath)

	// Remove illegal characters for paths, flattening accents
	// and replacing some common separators with -
	filePath = cleanString(filePath, illegalPath)

	// NB this may be of length 0, caller must check
	return filePath
}

// Remove all other unrecognised characters apart from
var illegalName = regexp.MustCompile(`[^[:alnum:]-.]`)

// Name makes a string safe to use in a file name by first finding the path basename, then replacing non-ascii characters.
func Name(s string) string {
	// Start with lowercase string
	fileName := s
	fileName = path.Clean(path.Base(fileName))

	// Remove illegal characters for names, replacing some common separators with -
	fileName = cleanString(fileName, illegalName)

	// NB this may be of length 0, caller must check
	return fileName
}

// Replace these separators with -
var baseNameSeparators = regexp.MustCompile(`[./]`)

// BaseName makes a string safe to use in a file name, producing a sanitized basename replacing . or / with -.
// No attempt is made to normalise a path or normalise case.
func BaseName(s string) string {

	// Replace certain joining characters with a dash
	baseName := baseNameSeparators.ReplaceAllString(s, "-")

	// Remove illegal characters for names, replacing some common separators with -
	baseName = cleanString(baseName, illegalName)

	// NB this may be of length 0, caller must check
	return baseName
}

// A very limited list of transliterations to catch common european names translated to urls.
// This set could be expanded with at least caps and many more characters.
var transliterations = map[rune]string{
	'À': "A",
	'Á': "A",
	'Â': "A",
	'Ã': "A",
	'Ä': "A",
	'Å': "AA",
	'Æ': "AE",
	'Ç': "C",
	'È': "E",
	'É': "E",
	'Ê': "E",
	'Ë': "E",
	'Ì': "I",
	'Í': "I",
	'Î': "I",
	'Ï': "I",
	'Ð': "D",
	'Ł': "L",
	'Ñ': "N",
	'Ò': "O",
	'Ó': "O",
	'Ô': "O",
	'Õ': "O",
	'Ö': "OE",
	'Ø': "OE",
	'Œ': "OE",
	'Ù': "U",
	'Ú': "U",
	'Ü': "UE",
	'Û': "U",
	'Ý': "Y",
	'Þ': "TH",
	'ẞ': "SS",
	'à': "a",
	'á': "a",
	'â': "a",
	'ã': "a",
	'ä': "ae",
	'å': "aa",
	'æ': "ae",
	'ç': "c",
	'è': "e",
	'é': "e",
	'ê': "e",
	'ë': "e",
	'ì': "i",
	'í': "i",
	'î': "i",
	'ï': "i",
	'ð': "d",
	'ł': "l",
	'ñ': "n",
	'ń': "n",
	'ò': "o",
	'ó': "o",
	'ô': "o",
	'õ': "o",
	'ō': "o",
	'ö': "oe",
	'ø': "oe",
	'œ': "oe",
	'ś': "s",
	'ù': "u",
	'ú': "u",
	'û': "u",
	'ū': "u",
	'ü': "ue",
	'ý': "y",
	'ÿ': "y",
	'ż': "z",
	'þ': "th",
	'ß': "ss",
}

// Accents replaces a set of accented characters with ascii equivalents.
func Accents(s string) string {
	// Replace some common accent characters
	b := bytes.NewBufferString("")
	for _, c := range s {
		// Check transliterations first
		if val, ok := transliterations[c]; ok {
			b.WriteString(val)
		} else {
			b.WriteRune(c)
		}
	}
	return b.String()
}

var (
	// If the attribute contains data: or javascript: anywhere, ignore it
	// we don't allow this in attributes as it is so frequently used for xss
	// NB we allow spaces in the value, and lowercase.
	illegalAttr = regexp.MustCompile(`(d\s*a\s*t\s*a|j\s*a\s*v\s*a\s*s\s*c\s*r\s*i\s*p\s*t\s*)\s*:`)

	// We are far more restrictive with href attributes.
	legalHrefAttr = regexp.MustCompile(`\A[/#][^/\\]?|mailto:|http://|https://`)
)

// cleanAttributes returns an array of attributes after removing malicious ones.
func cleanAttributes(a []parser.Attribute, allowed []string) []parser.Attribute {
	if len(a) == 0 {
		return a
	}

	var cleaned []parser.Attribute
	for _, attr := range a {
		if includes(allowed, attr.Key) {

			val := strings.ToLower(attr.Val)

			// Check for illegal attribute values
			if illegalAttr.FindString(val) != "" {
				attr.Val = ""
			}

			// Check for legal href values - / mailto:// http:// or https://
			if attr.Key == "href" {
				if legalHrefAttr.FindString(val) == "" {
					attr.Val = ""
				}
			}

			// If we still have an attribute, append it to the array
			if attr.Val != "" {
				cleaned = append(cleaned, attr)
			}
		}
	}
	return cleaned
}

// A list of characters we consider separators in normal strings and replace with our canonical separator - rather than removing.
var (
	separators = regexp.MustCompile(`[!&_="#|+?:]`)

	dashes = regexp.MustCompile(`[\-]+`)
)

// cleanString replaces separators with - and removes characters listed in the regexp provided from string.
// Accents, spaces, and all characters not in A-Za-z0-9 are replaced.
func cleanString(s string, r *regexp.Regexp) string {

	// Remove any trailing space to avoid ending on -
	s = strings.Trim(s, " ")

	// Flatten accents first so that if we remove non-ascii we still get a legible name
	s = Accents(s)

	// Replace certain joining characters with a dash
	s = separators.ReplaceAllString(s, "-")

	// Remove all other unrecognised characters - NB we do allow any printable characters
	//s = r.ReplaceAllString(s, "")

	// Remove any multiple dashes caused by replacements above
	s = dashes.ReplaceAllString(s, "-")

	return s
}

// includes checks for inclusion of a string in a []string.
func includes(a []string, s string) bool {
	for _, as := range a {
		if as == s {
			return true
		}
	}
	return false
}
fix naming issues 4 years ago			`// Package sanitize provides functions for sanitizing text.`
			`package sanitize`

			`import (`
			`"bytes"`
			`"html"`
			`"html/template"`
			`"io"`
			`"path"`
			`"regexp"`
			`"strings"`

			`parser "golang.org/x/net/html"`
			`)`

			`var (`
			`ignoreTags = []string{"title", "script", "style", "iframe", "frame", "frameset", "noframes", "noembed", "embed", "applet", "object", "base"}`

			`defaultTags = []string{"h1", "h2", "h3", "h4", "h5", "h6", "div", "span", "hr", "p", "br", "b", "i", "strong", "em", "ol", "ul", "li", "a", "img", "pre", "code", "blockquote", "article", "section"}`

			`defaultAttributes = []string{"id", "class", "src", "href", "title", "alt", "name", "rel"}`
			`)`

			`// HTMLAllowing sanitizes html, allowing some tags.`
			`// Arrays of allowed tags and allowed attributes may optionally be passed as the second and third arguments.`
			`func HTMLAllowing(s string, args ...[]string) (string, error) {`

			`allowedTags := defaultTags`
			`if len(args) > 0 {`
			`allowedTags = args[0]`
			`}`
			`allowedAttributes := defaultAttributes`
			`if len(args) > 1 {`
			`allowedAttributes = args[1]`
			`}`

			`// Parse the html`
			`tokenizer := parser.NewTokenizer(strings.NewReader(s))`

			`buffer := bytes.NewBufferString("")`
			`ignore := ""`

			`for {`
			`tokenType := tokenizer.Next()`
			`token := tokenizer.Token()`

			`switch tokenType {`

			`case parser.ErrorToken:`
			`err := tokenizer.Err()`
			`if err == io.EOF {`
			`return buffer.String(), nil`
			`}`
			`return "", err`

			`case parser.StartTagToken:`

			`if len(ignore) == 0 && includes(allowedTags, token.Data) {`
			`token.Attr = cleanAttributes(token.Attr, allowedAttributes)`
			`buffer.WriteString(token.String())`
			`} else if includes(ignoreTags, token.Data) {`
			`ignore = token.Data`
			`}`

			`case parser.SelfClosingTagToken:`

			`if len(ignore) == 0 && includes(allowedTags, token.Data) {`
			`token.Attr = cleanAttributes(token.Attr, allowedAttributes)`
			`buffer.WriteString(token.String())`
			`} else if token.Data == ignore {`
			`ignore = ""`
			`}`

			`case parser.EndTagToken:`
			`if len(ignore) == 0 && includes(allowedTags, token.Data) {`
			`token.Attr = []parser.Attribute{}`
			`buffer.WriteString(token.String())`
			`} else if token.Data == ignore {`
			`ignore = ""`
			`}`

			`case parser.TextToken:`
			`// We allow text content through, unless ignoring this entire tag and its contents (including other tags)`
			`if ignore == "" {`
			`buffer.WriteString(token.String())`
			`}`
			`case parser.CommentToken:`
			`// We ignore comments by default`
			`case parser.DoctypeToken:`
			`// We ignore doctypes by default - html5 does not require them and this is intended for sanitizing snippets of text`
			`default:`
			`// We ignore unknown token types by default`

			`}`

			`}`

			`}`

			`// HTML strips html tags, replace common entities, and escapes <>&;'" in the result.`
			`// Note the returned text may contain entities as it is escaped by HTMLEscapeString, and most entities are not translated.`
			`func HTML(s string) (output string) {`

			`// Shortcut strings with no tags in them`
			`if !strings.ContainsAny(s, "<>") {`
			`output = s`
			`} else {`

			`// First remove line breaks etc as these have no meaning outside html tags (except pre)`
			`// this means pre sections will lose formatting... but will result in less unintentional paras.`
			`s = strings.Replace(s, "\n", "", -1)`

			`// Then replace line breaks with newlines, to preserve that formatting`
			`s = strings.Replace(s, "</p>", "\n", -1)`
			`s = strings.Replace(s, "<br>", "\n", -1)`
			`s = strings.Replace(s, "</br>", "\n", -1)`
			`s = strings.Replace(s, "<br/>", "\n", -1)`
			`s = strings.Replace(s, "<br />", "\n", -1)`

			`// Walk through the string removing all tags`
			`b := bytes.NewBufferString("")`
			`inTag := false`
			`for _, r := range s {`
			`switch r {`
			`case '<':`
			`inTag = true`
			`case '>':`
			`inTag = false`
			`default:`
			`if !inTag {`
			`b.WriteRune(r)`
			`}`
			`}`
			`}`
			`output = b.String()`
			`}`

			`// Remove a few common harmless entities, to arrive at something more like plain text`
			`output = strings.Replace(output, "‘", "'", -1)`
			`output = strings.Replace(output, "’", "'", -1)`
			`output = strings.Replace(output, "“", "\"", -1)`
			`output = strings.Replace(output, "”", "\"", -1)`
			`output = strings.Replace(output, " ", " ", -1)`
			`output = strings.Replace(output, """, "\"", -1)`
			`output = strings.Replace(output, "'", "'", -1)`

			`// Translate some entities into their plain text equivalent (for example accents, if encoded as entities)`
			`output = html.UnescapeString(output)`

			`// In case we have missed any tags above, escape the text - removes <, >, &, ' and ".`
			`output = template.HTMLEscapeString(output)`

			`// After processing, remove some harmless entities &, ' and " which are encoded by HTMLEscapeString`
			`output = strings.Replace(output, """, "\"", -1)`
			`output = strings.Replace(output, "'", "'", -1)`
			`output = strings.Replace(output, "& ", "& ", -1) // NB space after`
			`output = strings.Replace(output, "&amp; ", "& ", -1) // NB space after`

			`return output`
			`}`

			`// We are very restrictive as this is intended for ascii url slugs`
			var illegalPath = regexp.MustCompile(`[^[:alnum:]\~\-\./]`)

			`// Path makes a string safe to use as a URL path,`
			`// removing accents and replacing separators with -.`
			`// The path may still start at / and is not intended`
			`// for use as a file system path without prefix.`
			`func Path(s string) string {`
			`// Start with lowercase string`
			`filePath := strings.ToLower(s)`
			`filePath = strings.Replace(filePath, "..", "", -1)`
			`filePath = path.Clean(filePath)`

			`// Remove illegal characters for paths, flattening accents`
			`// and replacing some common separators with -`
			`filePath = cleanString(filePath, illegalPath)`

			`// NB this may be of length 0, caller must check`
			`return filePath`
			`}`

			`// Remove all other unrecognised characters apart from`
			var illegalName = regexp.MustCompile(`[^[:alnum:]-.]`)

			`// Name makes a string safe to use in a file name by first finding the path basename, then replacing non-ascii characters.`
			`func Name(s string) string {`
			`// Start with lowercase string`
			`fileName := s`
			`fileName = path.Clean(path.Base(fileName))`

			`// Remove illegal characters for names, replacing some common separators with -`
			`fileName = cleanString(fileName, illegalName)`

			`// NB this may be of length 0, caller must check`
			`return fileName`
			`}`

			`// Replace these separators with -`
			var baseNameSeparators = regexp.MustCompile(`[./]`)

			`// BaseName makes a string safe to use in a file name, producing a sanitized basename replacing . or / with -.`
			`// No attempt is made to normalise a path or normalise case.`
			`func BaseName(s string) string {`

			`// Replace certain joining characters with a dash`
			`baseName := baseNameSeparators.ReplaceAllString(s, "-")`

			`// Remove illegal characters for names, replacing some common separators with -`
			`baseName = cleanString(baseName, illegalName)`

			`// NB this may be of length 0, caller must check`
			`return baseName`
			`}`

			`// A very limited list of transliterations to catch common european names translated to urls.`
			`// This set could be expanded with at least caps and many more characters.`
			`var transliterations = map[rune]string{`
			`'À': "A",`
			`'Á': "A",`
			`'Â': "A",`
			`'Ã': "A",`
			`'Ä': "A",`
			`'Å': "AA",`
			`'Æ': "AE",`
			`'Ç': "C",`
			`'È': "E",`
			`'É': "E",`
			`'Ê': "E",`
			`'Ë': "E",`
			`'Ì': "I",`
			`'Í': "I",`
			`'Î': "I",`
			`'Ï': "I",`
			`'Ð': "D",`
			`'Ł': "L",`
			`'Ñ': "N",`
			`'Ò': "O",`
			`'Ó': "O",`
			`'Ô': "O",`
			`'Õ': "O",`
			`'Ö': "OE",`
			`'Ø': "OE",`
			`'Œ': "OE",`
			`'Ù': "U",`
			`'Ú': "U",`
			`'Ü': "UE",`
			`'Û': "U",`
			`'Ý': "Y",`
			`'Þ': "TH",`
			`'ẞ': "SS",`
			`'à': "a",`
			`'á': "a",`
			`'â': "a",`
			`'ã': "a",`
			`'ä': "ae",`
			`'å': "aa",`
			`'æ': "ae",`
			`'ç': "c",`
			`'è': "e",`
			`'é': "e",`
			`'ê': "e",`
			`'ë': "e",`
			`'ì': "i",`
			`'í': "i",`
			`'î': "i",`
			`'ï': "i",`
			`'ð': "d",`
			`'ł': "l",`
			`'ñ': "n",`
			`'ń': "n",`
			`'ò': "o",`
			`'ó': "o",`
			`'ô': "o",`
			`'õ': "o",`
			`'ō': "o",`
			`'ö': "oe",`
			`'ø': "oe",`
			`'œ': "oe",`
			`'ś': "s",`
			`'ù': "u",`
			`'ú': "u",`
			`'û': "u",`
			`'ū': "u",`
			`'ü': "ue",`
			`'ý': "y",`
			`'ÿ': "y",`
			`'ż': "z",`
			`'þ': "th",`
			`'ß': "ss",`
			`}`

			`// Accents replaces a set of accented characters with ascii equivalents.`
			`func Accents(s string) string {`
			`// Replace some common accent characters`
			`b := bytes.NewBufferString("")`
			`for _, c := range s {`
			`// Check transliterations first`
			`if val, ok := transliterations[c]; ok {`
			`b.WriteString(val)`
			`} else {`
			`b.WriteRune(c)`
			`}`
			`}`
			`return b.String()`
			`}`

			`var (`
			`// If the attribute contains data: or javascript: anywhere, ignore it`
			`// we don't allow this in attributes as it is so frequently used for xss`
			`// NB we allow spaces in the value, and lowercase.`
			illegalAttr = regexp.MustCompile(`(d\sa\st\sa\|j\sa\sv\sa\ss\sc\sr\si\sp\st\s)\s:`)

			`// We are far more restrictive with href attributes.`
			legalHrefAttr = regexp.MustCompile(`\A[/#][^/\\]?\|mailto:\|http://\|https://`)
			`)`

			`// cleanAttributes returns an array of attributes after removing malicious ones.`
			`func cleanAttributes(a []parser.Attribute, allowed []string) []parser.Attribute {`
			`if len(a) == 0 {`
			`return a`
			`}`

			`var cleaned []parser.Attribute`
			`for _, attr := range a {`
			`if includes(allowed, attr.Key) {`

			`val := strings.ToLower(attr.Val)`

			`// Check for illegal attribute values`
			`if illegalAttr.FindString(val) != "" {`
			`attr.Val = ""`
			`}`

			`// Check for legal href values - / mailto:// http:// or https://`
			`if attr.Key == "href" {`
			`if legalHrefAttr.FindString(val) == "" {`
			`attr.Val = ""`
			`}`
			`}`

			`// If we still have an attribute, append it to the array`
			`if attr.Val != "" {`
			`cleaned = append(cleaned, attr)`
			`}`
			`}`
			`}`
			`return cleaned`
			`}`

			`// A list of characters we consider separators in normal strings and replace with our canonical separator - rather than removing.`
			`var (`
fixed breaking file download 4 years ago			separators = regexp.MustCompile(`[!&_="#\|+?:]`)
fix naming issues 4 years ago
			dashes = regexp.MustCompile(`[\-]+`)
			`)`

			`// cleanString replaces separators with - and removes characters listed in the regexp provided from string.`
			`// Accents, spaces, and all characters not in A-Za-z0-9 are replaced.`
			`func cleanString(s string, r *regexp.Regexp) string {`

			`// Remove any trailing space to avoid ending on -`
			`s = strings.Trim(s, " ")`

			`// Flatten accents first so that if we remove non-ascii we still get a legible name`
			`s = Accents(s)`

			`// Replace certain joining characters with a dash`
better file name fix 4 years ago			`s = separators.ReplaceAllString(s, "-")`
fix naming issues 4 years ago
			`// Remove all other unrecognised characters - NB we do allow any printable characters`
			`//s = r.ReplaceAllString(s, "")`

			`// Remove any multiple dashes caused by replacements above`
			`s = dashes.ReplaceAllString(s, "-")`

			`return s`
			`}`

			`// includes checks for inclusion of a string in a []string.`
			`func includes(a []string, s string) bool {`
			`for _, as := range a {`
			`if as == s {`
			`return true`
			`}`
			`}`
			`return false`
			`}`