fix naming issues

4 years ago · e113947f35
parent 319b34831a
commit e113947f35
7 changed files with 747 additions and 7 deletions
--- a/go.mod
+++ b/go.mod
@ -14,7 +14,7 @@ require (
 	go.uber.org/multierr v1.6.0 // indirect
 	go.uber.org/zap v1.16.0
 	golang.org/x/crypto v0.0.0-20200820211705-5c72a883971a
-	golang.org/x/net v0.0.0-20201002202402-0a1ea396d57c // indirect
+	golang.org/x/net v0.0.0-20201002202402-0a1ea396d57c
 	gorm.io/driver/sqlite v1.1.3
 	gorm.io/gorm v1.20.2
 )
--- a/internal/sanitize/.gitignore
+++ b/internal/sanitize/.gitignore
@ -0,0 +1,22 @@
+# Compiled Object files, Static and Dynamic libs (Shared Objects)
+*.o
+*.a
+*.so
+
+# Folders
+_obj
+_test
+
+# Architecture specific extensions/prefixes
+*.[568vq]
+[568vq].out
+
+*.cgo1.go
+*.cgo2.c
+_cgo_defun.c
+_cgo_gotypes.go
+_cgo_export.*
+
+_testmain.go
+
+*.exe
--- a/internal/sanitize/LICENSE
+++ b/internal/sanitize/LICENSE
@ -0,0 +1,27 @@
+Copyright (c) 2017 Mechanism Design. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/internal/sanitize/README.md
+++ b/internal/sanitize/README.md
@ -0,0 +1,62 @@
+sanitize [![GoDoc](https://godoc.org/github.com/kennygrant/sanitize?status.svg)](https://godoc.org/github.com/kennygrant/sanitize) [![Go Report Card](https://goreportcard.com/badge/github.com/kennygrant/sanitize)](https://goreportcard.com/report/github.com/kennygrant/sanitize) [![CircleCI](https://circleci.com/gh/kennygrant/sanitize.svg?style=svg)](https://circleci.com/gh/kennygrant/sanitize)
+========
+
+Package sanitize provides functions to sanitize html and paths with go (golang).
+
+FUNCTIONS
+
+
+```go
+sanitize.Accents(s string) string
+```
+
+Accents replaces a set of accented characters with ascii equivalents.
+
+```go
+sanitize.BaseName(s string) string
+```
+
+BaseName makes a string safe to use in a file name, producing a sanitized basename replacing . or / with -. Unlike Name no attempt is made to normalise text as a path.
+
+```go
+sanitize.HTML(s string) string
+```
+
+HTML strips html tags with a very simple parser, replaces common entities, and escapes < and > in the result. The result is intended to be used as plain text.
+
+```go
+sanitize.HTMLAllowing(s string, args...[]string) (string, error)
+```
+
+HTMLAllowing parses html and allows certain tags and attributes from the lists optionally specified by args - args[0] is a list of allowed tags, args[1] is a list of allowed attributes. If either is missing, default sets are used.
+
+```go
+sanitize.Name(s string) string
+```
+
+Name makes a string safe to use in a file name by first finding the path basename, then replacing non-ascii characters.
+
+```go
+sanitize.Path(s string) string
+```
+
+Path makes a string safe to use as an url path.
+
+
+Changes
+-------
+
+Version 1.2
+
+Adjusted HTML function to avoid linter warning
+Added more tests from https://githubengineering.com/githubs-post-csp-journey/
+Changed name of license file
+Added badges and change log to readme
+
+Version 1.1
+Fixed typo in comments.
+Merge pull request from Povilas Balzaravicius Pawka 
+ - replace br tags with newline even when they contain a space
+
+Version 1.0
+First release
--- a/internal/sanitize/sanitize.go
+++ b/internal/sanitize/sanitize.go
@ -0,0 +1,388 @@
+// Package sanitize provides functions for sanitizing text.
+package sanitize
+
+import (
+	"bytes"
+	"html"
+	"html/template"
+	"io"
+	"path"
+	"regexp"
+	"strings"
+
+	parser "golang.org/x/net/html"
+)
+
+var (
+	// ignoreTags lists the tags whose whole contents (text and nested tags) HTMLAllowing drops
+	// when the tag itself is not emitted, instead of only removing the tag.
+	ignoreTags = []string{"title", "script", "style", "iframe", "frame", "frameset", "noframes", "noembed", "embed", "applet", "object", "base"}
+
+	// defaultTags is the list of allowed tags HTMLAllowing uses when the caller passes no tag list.
+	defaultTags = []string{"h1", "h2", "h3", "h4", "h5", "h6", "div", "span", "hr", "p", "br", "b", "i", "strong", "em", "ol", "ul", "li", "a", "img", "pre", "code", "blockquote", "article", "section"}
+
+	// defaultAttributes is the list of allowed attributes HTMLAllowing uses when the caller passes no attribute list.
+	defaultAttributes = []string{"id", "class", "src", "href", "title", "alt", "name", "rel"}
+)
+
+// HTMLAllowing sanitizes html, allowing some tags.
+// Arrays of allowed tags and allowed attributes may optionally be passed as the second and third arguments;
+// when args[0] is missing defaultTags is used, and when args[1] is missing defaultAttributes is used.
+// It returns the sanitized markup once the tokenizer reaches io.EOF, or an empty string and the
+// tokenizer's error for any other tokenizer failure.
+func HTMLAllowing(s string, args ...[]string) (string, error) {
+
+	allowedTags := defaultTags
+	if len(args) > 0 {
+		allowedTags = args[0]
+	}
+	allowedAttributes := defaultAttributes
+	if len(args) > 1 {
+		allowedAttributes = args[1]
+	}
+
+	// Parse the html
+	tokenizer := parser.NewTokenizer(strings.NewReader(s))
+
+	buffer := bytes.NewBufferString("")
+	// ignore holds the name of the tag whose contents are currently being dropped, or "" when nothing is being dropped
+	ignore := ""
+
+	for {
+		tokenType := tokenizer.Next()
+		token := tokenizer.Token()
+
+		switch tokenType {
+
+		case parser.ErrorToken:
+			// io.EOF is the normal end of input; anything else is reported to the caller
+			err := tokenizer.Err()
+			if err == io.EOF {
+				return buffer.String(), nil
+			}
+			return "", err
+
+		case parser.StartTagToken:
+
+			// Emit allowed tags with cleaned attributes; otherwise start dropping content if the tag is in ignoreTags
+			if len(ignore) == 0 && includes(allowedTags, token.Data) {
+				token.Attr = cleanAttributes(token.Attr, allowedAttributes)
+				buffer.WriteString(token.String())
+			} else if includes(ignoreTags, token.Data) {
+				ignore = token.Data
+			}
+
+		case parser.SelfClosingTagToken:
+
+			// Same as a start tag, except that a self-closing tag matching the ignored tag ends the ignored section
+			if len(ignore) == 0 && includes(allowedTags, token.Data) {
+				token.Attr = cleanAttributes(token.Attr, allowedAttributes)
+				buffer.WriteString(token.String())
+			} else if token.Data == ignore {
+				ignore = ""
+			}
+
+		case parser.EndTagToken:
+			// End tags are written without any attributes; the end tag of the ignored tag stops the dropping
+			if len(ignore) == 0 && includes(allowedTags, token.Data) {
+				token.Attr = []parser.Attribute{}
+				buffer.WriteString(token.String())
+			} else if token.Data == ignore {
+				ignore = ""
+			}
+
+		case parser.TextToken:
+			// We allow text content through, unless ignoring this entire tag and its contents (including other tags)
+			if ignore == "" {
+				buffer.WriteString(token.String())
+			}
+		case parser.CommentToken:
+			// We ignore comments by default
+		case parser.DoctypeToken:
+			// We ignore doctypes by default - html5 does not require them and this is intended for sanitizing snippets of text
+		default:
+			// We ignore unknown token types by default
+
+		}
+
+	}
+
+}
+
+// HTML strips html tags, replace common entities, and escapes <>&;'" in the result.
+// Note the returned text may contain entities as it is escaped by HTMLEscapeString, and most entities are not translated.
+func HTML(s string) (output string) {
+
+	// Shortcut strings with no tags in them
+	if !strings.ContainsAny(s, "<>") {
+		output = s
+	} else {
+
+		// First remove line breaks etc as these have no meaning outside html tags (except pre)
+		// this means pre sections will lose formatting... but will result in less unintentional paras.
+		s = strings.Replace(s, "\n", "", -1)
+
+		// Then replace line breaks with newlines, to preserve that formatting
+		s = strings.Replace(s, "</p>", "\n", -1)
+		s = strings.Replace(s, "<br>", "\n", -1)
+		s = strings.Replace(s, "</br>", "\n", -1)
+		s = strings.Replace(s, "<br/>", "\n", -1)
+		s = strings.Replace(s, "<br />", "\n", -1)
+
+		// Walk through the string removing all tags
+		b := bytes.NewBufferString("")
+		inTag := false
+		for _, r := range s {
+			switch r {
+			case '<':
+				inTag = true
+			case '>':
+				inTag = false
+			default:
+				if !inTag {
+					b.WriteRune(r)
+				}
+			}
+		}
+		output = b.String()
+	}
+
+	// Remove a few common harmless entities, to arrive at something more like plain text
+	output = strings.Replace(output, "&#8216;", "'", -1)
+	output = strings.Replace(output, "&#8217;", "'", -1)
+	output = strings.Replace(output, "&#8220;", "\"", -1)
+	output = strings.Replace(output, "&#8221;", "\"", -1)
+	output = strings.Replace(output, "&nbsp;", " ", -1)
+	output = strings.Replace(output, "&quot;", "\"", -1)
+	output = strings.Replace(output, "&apos;", "'", -1)
+
+	// Translate some entities into their plain text equivalent (for example accents, if encoded as entities)
+	output = html.UnescapeString(output)
+
+	// In case we have missed any tags above, escape the text - removes <, >, &, ' and ".
+	output = template.HTMLEscapeString(output)
+
+	// After processing, remove some harmless entities &, ' and " which are encoded by HTMLEscapeString
+	output = strings.Replace(output, "&#34;", "\"", -1)
+	output = strings.Replace(output, "&#39;", "'", -1)
+	output = strings.Replace(output, "&amp; ", "& ", -1)     // NB space after
+	output = strings.Replace(output, "&amp;amp; ", "& ", -1) // NB space after
+
+	return output
+}
+
+// illegalPath matches every character outside the restrictive set kept for ascii url slugs:
+// alphanumerics, ~, -, . and /.
+var illegalPath = regexp.MustCompile(`[^[:alnum:]\~\-\./]`)
+
+// Path prepares a string for use as a URL path. The input is lowercased, every ".."
+// sequence is removed, the remainder is normalised with path.Clean and the result is
+// handed to cleanString together with illegalPath.
+// The path may still start at / and is not intended
+// for use as a file system path without prefix.
+// NB the result may be of length 0, caller must check.
+func Path(s string) string {
+	lowered := strings.ToLower(s)
+	withoutDots := strings.Replace(lowered, "..", "", -1)
+	return cleanString(path.Clean(withoutDots), illegalPath)
+}
+
+// illegalName matches every character other than alphanumerics, - and .
+var illegalName = regexp.MustCompile(`[^[:alnum:]-.]`)
+
+// Name prepares a string for use as a file name: it takes the path basename of s,
+// normalises it with path.Clean and hands the result to cleanString together with illegalName.
+// The case of the input is left untouched.
+// NB the result may be of length 0, caller must check.
+func Name(s string) string {
+	base := path.Clean(path.Base(s))
+	return cleanString(base, illegalName)
+}
+
+// baseNameSeparators matches the joining characters . and / which BaseName turns into -.
+var baseNameSeparators = regexp.MustCompile(`[./]`)
+
+// BaseName prepares a string for use in a file name, replacing every . or / with - and
+// handing the result to cleanString together with illegalName.
+// No attempt is made to normalise a path or normalise case.
+// NB the result may be of length 0, caller must check.
+func BaseName(s string) string {
+	dashed := baseNameSeparators.ReplaceAllString(s, "-")
+	return cleanString(dashed, illegalName)
+}
+
+// transliterations maps accented and special runes to the ascii text Accents writes in their place.
+// It is a very limited list meant to catch common european names translated to urls,
+// and could be expanded with many more characters.
+// NOTE(review): 'Ä' maps to "A" while 'ä' maps to "ae" and 'Ö'/'Ü' map to "OE"/"UE" - confirm
+// whether this asymmetry is intended.
+var transliterations = map[rune]string{
+	'À': "A",
+	'Á': "A",
+	'Â': "A",
+	'Ã': "A",
+	'Ä': "A",
+	'Å': "AA",
+	'Æ': "AE",
+	'Ç': "C",
+	'È': "E",
+	'É': "E",
+	'Ê': "E",
+	'Ë': "E",
+	'Ì': "I",
+	'Í': "I",
+	'Î': "I",
+	'Ï': "I",
+	'Ð': "D",
+	'Ł': "L",
+	'Ñ': "N",
+	'Ò': "O",
+	'Ó': "O",
+	'Ô': "O",
+	'Õ': "O",
+	'Ö': "OE",
+	'Ø': "OE",
+	'Œ': "OE",
+	'Ù': "U",
+	'Ú': "U",
+	'Ü': "UE",
+	'Û': "U",
+	'Ý': "Y",
+	'Þ': "TH",
+	'ẞ': "SS",
+	'à': "a",
+	'á': "a",
+	'â': "a",
+	'ã': "a",
+	'ä': "ae",
+	'å': "aa",
+	'æ': "ae",
+	'ç': "c",
+	'è': "e",
+	'é': "e",
+	'ê': "e",
+	'ë': "e",
+	'ì': "i",
+	'í': "i",
+	'î': "i",
+	'ï': "i",
+	'ð': "d",
+	'ł': "l",
+	'ñ': "n",
+	'ń': "n",
+	'ò': "o",
+	'ó': "o",
+	'ô': "o",
+	'õ': "o",
+	'ō': "o",
+	'ö': "oe",
+	'ø': "oe",
+	'œ': "oe",
+	'ś': "s",
+	'ù': "u",
+	'ú': "u",
+	'û': "u",
+	'ū': "u",
+	'ü': "ue",
+	'ý': "y",
+	'ÿ': "y",
+	'ż': "z",
+	'þ': "th",
+	'ß': "ss",
+}
+
+// Accents replaces a set of accented characters with ascii equivalents.
+func Accents(s string) string {
+	// Replace some common accent characters
+	b := bytes.NewBufferString("")
+	for _, c := range s {
+		// Check transliterations first
+		if val, ok := transliterations[c]; ok {
+			b.WriteString(val)
+		} else {
+			b.WriteRune(c)
+		}
+	}
+	return b.String()
+}
+
+var (
+	// If the attribute contains data: or javascript: anywhere, ignore it
+	// we don't allow this in attributes as it is so frequently used for xss
+	// NB we allow spaces in the value, and lowercase.
+	illegalAttr = regexp.MustCompile(`(d\s*a\s*t\s*a|j\s*a\s*v\s*a\s*s\s*c\s*r\s*i\s*p\s*t\s*)\s*:`)
+
+	// We are far more restrictive with href attributes: the value must START with
+	// / or #, mailto:, http:// or https:// (leading whitespace is tolerated).
+	// The alternatives are grouped so that \A anchors all of them; without the group
+	// only the first alternative was anchored and any value merely containing
+	// "http://" (for example "vbscript:...http://") was accepted.
+	// NOTE(review): the trailing [^/\\]? is optional, so "//host" and "/\host" still match.
+	legalHrefAttr = regexp.MustCompile(`\A\s*(?:[/#][^/\\]?|mailto:|http://|https://)`)
+)
+
+// cleanAttributes filters a tag's attributes. An attribute is kept only when its key is in
+// allowed, its value is not empty, its lowercased value does not match illegalAttr and,
+// for href, its lowercased value matches legalHrefAttr.
+// An empty input is returned as is.
+func cleanAttributes(a []parser.Attribute, allowed []string) []parser.Attribute {
+	if len(a) == 0 {
+		return a
+	}
+
+	var kept []parser.Attribute
+	for _, candidate := range a {
+		if !includes(allowed, candidate.Key) {
+			continue
+		}
+
+		// All pattern checks run against the lowercased value
+		lowered := strings.ToLower(candidate.Val)
+
+		// Values containing data: or javascript: are rejected for every attribute
+		rejected := illegalAttr.FindString(lowered) != ""
+
+		// href must additionally look like / mailto:// http:// or https://
+		if candidate.Key == "href" && legalHrefAttr.FindString(lowered) == "" {
+			rejected = true
+		}
+
+		// Rejected attributes and attributes without a value are dropped
+		if rejected || candidate.Val == "" {
+			continue
+		}
+		kept = append(kept, candidate)
+	}
+	return kept
+}
+
+// A list of characters we consider separators in normal strings and replace with our canonical separator - rather than removing.
+// NOTE(review): the only use of separators in the code shown here is commented out inside cleanString.
+var (
+	separators = regexp.MustCompile(`[ &_=+:]`)
+
+	// dashes matches a run of one or more - characters, used to collapse repeated dashes
+	dashes = regexp.MustCompile(`[\-]+`)
+)
+
+// cleanString trims leading and trailing spaces, flattens accents via Accents and collapses
+// runs of dashes into a single dash.
+// The separator replacement and the removal of characters matching r are currently commented out,
+// so r is accepted but not used and all other characters pass through unchanged.
+func cleanString(s string, r *regexp.Regexp) string {
+
+	// Remove any leading or trailing space to avoid starting or ending on -
+	s = strings.Trim(s, " ")
+
+	// Flatten accents first so that if we remove non-ascii we still get a legible name
+	s = Accents(s)
+
+	// Replace certain joining characters with a dash (disabled)
+	//s = separators.ReplaceAllString(s, "-")
+
+	// Remove all other unrecognised characters - NB we do allow any printable characters (disabled)
+	//s = r.ReplaceAllString(s, "")
+
+	// Collapse any run of dashes into a single dash
+	s = dashes.ReplaceAllString(s, "-")
+
+	return s
+}
+
+// includes reports whether s is equal to any element of a.
+func includes(a []string, s string) bool {
+	for i := 0; i < len(a); i++ {
+		if a[i] == s {
+			return true
+		}
+	}
+	return false
+}
--- a/internal/sanitize/sanitize_test.go
+++ b/internal/sanitize/sanitize_test.go
@ -0,0 +1,236 @@
+// Utility functions for working with text
+package sanitize
+
+import (
+	"testing"
+)
+
+// Format is the failure message layout shared by the tests; it takes exactly three
+// arguments: the input, the expected output and the actual output.
+var Format = "\ninput:    %q\nexpected: %q\noutput:   %q"
+
+// Test is a single table entry pairing an input string with the output expected for it.
+type Test struct {
+	input    string
+	expected string
+}
+
+// NB the treatment of accents - they are removed and replaced with ascii transliterations
+var urls = []Test{
+	{"ReAd ME.md", `read-me.md`},
+	{"E88E08A7-279C-4CC1-8B90-86DE0D7044_3C.html", `e88e08a7-279c-4cc1-8b90-86de0d7044-3c.html`},
+	{"/user/test/I am a long url's_-?ASDF@£$%£%^testé.html", `/user/test/i-am-a-long-urls-asdfteste.html`},
+	{"/../../4-icon.jpg", `/4-icon.jpg`},
+	{"/Images_dir/../4-icon.jpg", `/images-dir/4-icon.jpg`},
+	{"../4 icon.*", `/4-icon.`},
+	{"Spac ey/Nôm/test før url", `spac-ey/nom/test-foer-url`},
+	{"../*", `/`},
+}
+
+// TestPath runs Path over every entry of urls and stops at the first mismatch.
+// NOTE(review): the expectations appear to assume cleanString still replaces separators and
+// removes illegal characters - verify against the current cleanString.
+func TestPath(t *testing.T) {
+	for _, tc := range urls {
+		if got := Path(tc.input); got != tc.expected {
+			t.Fatalf(Format, tc.input, tc.expected, got)
+		}
+	}
+}
+
+// BenchmarkPath measures Path over the urls table, failing on any unexpected output.
+func BenchmarkPath(b *testing.B) {
+	for run := 0; run < b.N; run++ {
+		for _, tc := range urls {
+			if got := Path(tc.input); got != tc.expected {
+				b.Fatalf(Format, tc.input, tc.expected, got)
+			}
+		}
+	}
+}
+
+var fileNames = []Test{
+	{"ReAd ME.md", `read-me.md`},
+	{"/var/etc/jobs/go/go/src/pkg/foo/bar.go", `bar.go`},
+	{"I am a long url's_-?ASDF@£$%£%^é.html", `i-am-a-long-urls-asdfe.html`},
+	{"/../../4-icon.jpg", `4-icon.jpg`},
+	{"/Images/../4-icon.jpg", `4-icon.jpg`},
+	{"../4 icon.jpg", `4-icon.jpg`},
+	{"../4 icon-testé *8%^\"'\".jpg ", `4-icon-teste-8.jpg`},
+	{"Überfluß an Döner macht schöner.JPEG", `ueberfluss-an-doener-macht-schoener.jpeg`},
+	{"Ä-_-Ü_:()_Ö-_-ä-_-ü-_-ö-_ß.webm", `ae-ue-oe-ae-ue-oe-ss.webm`},
+}
+
+// TestName runs Name over every entry of fileNames and stops at the first mismatch.
+// NOTE(review): the expectations are lowercase and dash-separated, which appears to assume
+// behaviour Name and cleanString no longer have - verify against the current code.
+func TestName(t *testing.T) {
+	for _, tc := range fileNames {
+		if got := Name(tc.input); got != tc.expected {
+			t.Fatalf(Format, tc.input, tc.expected, got)
+		}
+	}
+}
+
+// BenchmarkName measures Name over the fileNames table, failing on any unexpected output.
+func BenchmarkName(b *testing.B) {
+	for run := 0; run < b.N; run++ {
+		for _, tc := range fileNames {
+			if got := Name(tc.input); got != tc.expected {
+				b.Fatalf(Format, tc.input, tc.expected, got)
+			}
+		}
+	}
+}
+
+var baseFileNames = []Test{
+	{"The power & the Glory jpg file. The end", `The-power-the-Glory-jpg-file-The-end`},
+	{"/../../4-iCoN.jpg", `-4-iCoN-jpg`},
+	{"And/Or", `And-Or`},
+	{"Sonic.EXE", `Sonic-EXE`},
+	{"012: #Fetch for Defaults", `012-Fetch-for-Defaults`},
+}
+
+// TestBaseName runs BaseName over every entry of baseFileNames and stops at the first mismatch.
+func TestBaseName(t *testing.T) {
+	for _, tc := range baseFileNames {
+		if got := BaseName(tc.input); got != tc.expected {
+			t.Fatalf(Format, tc.input, tc.expected, got)
+		}
+	}
+}
+
+// Test with some malformed or malicious html
+// NB because we remove all tokens after a < until the next >
+// and do not attempt to parse, we should be safe from invalid html,
+// but will sometimes completely empty the string if we have invalid input
+// Note we sometimes use " in order to keep things on one line and use the ` character
+var htmlTests = []Test{
+	{`&nbsp;`, " "},
+	{`&amp;#x000D;`, `&amp;#x000D;`},
+	{`<invalid attr="invalid"<,<p><p><p><p><p>`, ``},
+	{"<b><p>Bold </b> Not bold</p>\nAlso not bold.", "Bold  Not bold\nAlso not bold."},
+	{`FOO&#x000D;ZOO`, "FOO\rZOO"},
+	{`<script><!--<script </s`, ``},
+	{`<a href="/" alt="Fab.com | Aqua Paper Map 22"" title="Fab.com | Aqua Paper Map 22" - fab.com">test</a>`, `test`},
+	{`<p</p>?> or <p id=0</p> or <<</>><ASDF><@$!@£M<<>>>>>>>>>>>>>><>***************aaaaaaaaaaaaaaaaaaaaaaaaaa>`, ` or ***************aaaaaaaaaaaaaaaaaaaaaaaaaa`},
+	{`<p>Some text</p><frameset src="testing.html"></frameset>`, "Some text\n"},
+	{`Something<br/>Some more`, "Something\nSome more"},
+	{`<a href="http://www.example.com"?>This is a 'test' of <b>bold</b> &amp; <i>italic</i></a> <br/> invalid markup.<//data>><alert><script CDATA[:Asdfjk2354115nkjafdgs]>. <div src=">">><><img src="">`, "This is a 'test' of bold & italic \n invalid markup.. \""},
+	{`<![CDATA[<sender>John Smith</sender>]]>`, `John Smith]]`},
+	{`<!-- <script src='blah.js' data-rel='fsd'> --> This is text`, ` -- This is text`},
+	{`<style>body{background-image:url(http://www.google.com/intl/en/images/logo.gif);}</style>`, `body{background-image:url(http://www.google.com/intl/en/images/logo.gif);}`},
+	{`&lt;iframe src="" attr=""&gt;>>>>>`, `&lt;iframe src="" attr=""&gt;`},
+	{`<IMG """><SCRIPT>alert("XSS")</SCRIPT>">`, `alert("XSS")"`},
+	{`<IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>`, ``},
+	{`<IMG SRC=JaVaScRiPt:alert('XSS')&gt;`, ``},
+	{`<IMG SRC="javascript:alert('XSS')" <test`, ``},
+	{`<a href="javascript:alert('XSS')" src="javascript:alert('XSS')" onclick="javascript:alert('XSS')"></a>`, ``},
+	{`&gt & test &lt`, `&gt; & test &lt;`},
+	{`<img></IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>`, ``},
+	{`&#8220;hello&#8221; it&#8217;s for &#8216;real&#8217;`, `"hello" it's for 'real'`},
+	{`<IMG SRC=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&
+#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>`, ``},
+	{`'';!--"<XSS>=&{()}`, `'';!--"=&amp;{()}`},
+	{"LINE 1<br />\nLINE 2", "LINE 1\nLINE 2"},
+
+	// Examples from https://githubengineering.com/githubs-post-csp-journey/
+	{`<img src='https://example.com/log_csrf?html=`, ``},
+	{`<img src='https://example.com/log_csrf?html=
+<form action="https://example.com/account/public_keys/19023812091023">
+...
+<input type="hidden" name="csrf_token" value="some_csrf_token_value">
+</form>`, `...`},
+	{`<img src='https://example.com?d=https%3A%2F%2Fsome-evil-site.com%2Fimages%2Favatar.jpg%2f
+	<p>secret</p>`, `secret
+`},
+	{`<form action="https://some-evil-site.com"><button>Click</button><textarea name='
+<!-- </textarea> --><!-- '" -->
+<form action="/logout">
+  <input name="authenticity_token" type="hidden" value="secret1">
+</form>`, `Click --  `},
+}
+
+// TestHTML runs HTML over every entry of htmlTests and stops at the first mismatch.
+func TestHTML(t *testing.T) {
+	for _, tc := range htmlTests {
+		if got := HTML(tc.input); got != tc.expected {
+			t.Fatalf(Format, tc.input, tc.expected, got)
+		}
+	}
+}
+
+var htmlTestsAllowing = []Test{
+	{`<IMG SRC="jav&#x0D;ascript:alert('XSS');">`, `<img>`},
+	{`<i>hello world</i href="javascript:alert('hello world')">`, `<i>hello world</i>`},
+	{`hello<br ><br / ><hr /><hr    >rulers`, `hello<br><br><hr/><hr>rulers`},
+	{`<span class="testing" id="testid" name="testname" style="font-color:red;text-size:gigantic;"><p>Span</p></span>`, `<span class="testing" id="testid" name="testname"><p>Span</p></span>`},
+	{`<div class="divclass">Div</div><h4><h3>test</h4>invalid</h3><p>test</p>`, `<div class="divclass">Div</div><h4><h3>test</h4>invalid</h3><p>test</p>`},
+	{`<p>Some text</p><exotic><iframe>test</iframe><frameset src="testing.html"></frameset>`, `<p>Some text</p>`},
+	{`<b>hello world</b>`, `<b>hello world</b>`},
+	{`text<p>inside<p onclick='alert()'/>too`, `text<p>inside<p/>too`},
+	{`&amp;#x000D;`, `&amp;#x000D;`},
+	{`<invalid attr="invalid"<,<p><p><p><p><p>`, `<p><p><p><p>`},
+	{"<b><p>Bold </b> Not bold</p>\nAlso not bold.", "<b><p>Bold </b> Not bold</p>\nAlso not bold."},
+	{"`FOO&#x000D;ZOO", "`FOO&#13;ZOO"},
+	{`<script><!--<script </s`, ``},
+	{`<a href="/" alt="Fab.com | Aqua Paper Map 22"" title="Fab.com | Aqua Paper Map 22" - fab.com">test</a>`, `<a href="/" alt="Fab.com | Aqua Paper Map 22" title="Fab.com | Aqua Paper Map 22">test</a>`},
+	{"<p</p>?> or <p id=0</p> or <<</>><ASDF><@$!@£M<<>>>>>>>>>>>>>><>***************aaaaaaaaaaaaaaaaaaaaaaaaaa>", "?&gt; or <p id=\"0&lt;/p\"> or &lt;&lt;&gt;&lt;@$!@£M&lt;&lt;&gt;&gt;&gt;&gt;&gt;&gt;&gt;&gt;&gt;&gt;&gt;&gt;&gt;&gt;&lt;&gt;***************aaaaaaaaaaaaaaaaaaaaaaaaaa&gt;"},
+	{`<p>Some text</p><exotic><iframe><frameset src="testing.html"></frameset>`, `<p>Some text</p>`},
+	{"Something<br/>Some more", `Something<br/>Some more`},
+	{`<a href="http://www.example.com"?>This is a 'test' of <b>bold</b> &amp; <i>italic</i></a> <br/> invalid markup.</data><alert><script CDATA[:Asdfjk2354115nkjafdgs]>. <div src=">escape;inside script tag"><img src="">`, `<a href="http://www.example.com">This is a &#39;test&#39; of <b>bold</b> &amp; <i>italic</i></a> <br/> invalid markup.`},
+	{"<sender ignore=me>John Smith</sender>", `John Smith`},
+	{"<!-- <script src='blah.js' data-rel='fsd'> --> This is text", ` This is text`},
+	{"<style>body{background-image:url(http://www.google.com/intl/en/images/logo.gif);}</style>", ``},
+	{`&lt;iframe src="" attr=""&gt;`, `&lt;iframe src=&#34;&#34; attr=&#34;&#34;&gt;`},
+	{`<IMG """><SCRIPT>alert("XSS")</SCRIPT>">`, `<img>&#34;&gt;`},
+	{`<IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>`, `<img>`},
+	{`<IMG SRC=JaVaScRiPt:alert('XSS')&gt;`, ``},
+	{`<IMG SRC="javascript:alert('XSS')">>> <test`, `<img>&gt;&gt; `},
+	{`&gt & test &lt`, `&gt; &amp; test &lt;`},
+	{`<img></IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>`, `<img></img>`},
+	{`<img src="data:text/javascript;alert('alert');">`, `<img>`},
+	{`<iframe src=http://... <`, ``},
+	{`<iframe src="data:CSS"><img><a><</a>;sdf<iframe>`, ``},
+	{`<img src=javascript:alert(document.cookie)>`, `<img>`},
+	{`<?php echo('hello world')>`, ``},
+	{`Hello <STYLE>.XSS{background-image:url("javascript:alert('XSS')");}</STYLE><A CLASS=XSS></A>World`, `Hello <a class="XSS"></a>World`},
+	{`<a href="javascript:alert('XSS1')" onmouseover="alert('XSS2')">XSS<a>`, `<a>XSS<a>`},
+	{`<a href="http://www.google.com/"><img src="https://ssl.gstatic.com/accounts/ui/logo_2x.png"/></a>`,
+		`<a href="http://www.google.com/"><img src="https://ssl.gstatic.com/accounts/ui/logo_2x.png"/></a>`},
+	{`<a href="javascript:alert(&#39;XSS1&#39;)" "document.write('<HTML> Tags and markup');">XSS<a>`, `<a> Tags and markup&#39;);&#34;&gt;XSS<a>`},
+	{`<a <script>document.write("UNTRUSTED INPUT: " + document.location.hash);<script/> >`, `<a>document.write(&#34;UNTRUSTED INPUT: &#34; + document.location.hash); &gt;`},
+	{`<a href="#anchor">foo</a>`, `<a href="#anchor">foo</a>`},
+	{`<IMG SRC=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>`, `<img>`},
+	{`<IMG SRC="jav	ascript:alert('XSS');">`, `<img>`},
+	{`<IMG SRC="jav&#x09;ascript:alert('XSS');">`, `<img>`},
+	{`<HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-`, ` +ADw-SCRIPT+AD4-alert(&#39;XSS&#39;);+ADw-/SCRIPT+AD4-`},
+	{`<SCRIPT>document.write("<SCRI");</SCRIPT>PT SRC="http://ha.ckers.org/xss.js"></SCRIPT>`, `PT SRC=&#34;http://ha.ckers.org/xss.js&#34;&gt;`},
+	{`<a href="javascript:alert('XSS')" src="javascript:alert('XSS')" onclick="javascript:alert('XSS')"></a>`, `<a></a>`},
+	{`'';!--"<XSS>=&{()}`, `&#39;&#39;;!--&#34;=&amp;{()}`},
+	{`<IMG SRC=javascript:alert('XSS')`, ``},
+	{`<IMG """><SCRIPT>alert("XSS")</SCRIPT>">`, `<img>&#34;&gt;`},
+	{`<IMG SRC=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&
+#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>`, `<img>`},
+	{`<a href="mailto:cool@test.com?subject=cooool">cool guy</a>`, `<a href="mailto:cool@test.com?subject=cooool">cool guy</a>`},
+}
+
+// TestHTMLAllowed runs HTMLAllowing with its default tag and attribute lists over every
+// entry of htmlTestsAllowing and stops at the first error or mismatch.
+func TestHTMLAllowed(t *testing.T) {
+
+	for _, test := range htmlTestsAllowing {
+		output, err := HTMLAllowing(test.input)
+		if err != nil {
+			// Format only has three verbs, so the error gets its own message instead of
+			// being passed as a fourth argument (which go vet rejects).
+			t.Fatalf("\ninput:    %q\nerror:    %v", test.input, err)
+		}
+		if output != test.expected {
+			t.Fatalf(Format, test.input, test.expected, output)
+		}
+	}
+}
+
+// BenchmarkHTMLAllowed measures HTMLAllowing with its default tag and attribute lists over
+// the htmlTestsAllowing table, failing on any error or unexpected output.
+func BenchmarkHTMLAllowed(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		for _, test := range htmlTestsAllowing {
+			output, err := HTMLAllowing(test.input)
+			if err != nil {
+				// Format only has three verbs, so the error gets its own message instead of
+				// being passed as a fourth argument (which go vet rejects).
+				b.Fatalf("\ninput:    %q\nerror:    %v", test.input, err)
+			}
+			if output != test.expected {
+				b.Fatalf(Format, test.input, test.expected, output)
+			}
+		}
+	}
+}
--- a/service/fileService.go
+++ b/service/fileService.go
@ -15,6 +15,7 @@ import (
 	"strconv"
 	"time"

+	"github.com/akhilrex/podgrab/internal/sanitize"
 	stringy "github.com/gobeam/stringy"
 )

@ -185,8 +186,8 @@ func httpClient() *http.Client {
 }

 func createFolder(folder string, parent string) string {
-	str := stringy.New(folder)
-	folder = str.RemoveSpecialCharacter()
+	folder = cleanFileName(folder)
+	//str := stringy.New(folder)
 	folderPath := path.Join(parent, folder)
 	if _, err := os.Stat(folderPath); os.IsNotExist(err) {
 		os.MkdirAll(folderPath, 0777)
@ -197,11 +198,11 @@ func createFolder(folder string, parent string) string {

 func createDataFolderIfNotExists(folder string) string {
 	dataPath := os.Getenv("DATA")
-	return createFolder(folder,dataPath)
+	return createFolder(folder, dataPath)
 }
 func createConfigFolderIfNotExists(folder string) string {
 	dataPath := os.Getenv("CONFIG")
-	return createFolder(folder,dataPath)
+	return createFolder(folder, dataPath)
 }

 func getFileName(link string, title string, defaultExtension string) string {
@ -214,12 +215,16 @@ func getFileName(link string, title string, defaultExtension string) string {
 	if len(ext) == 0 {
 		ext = defaultExtension
 	}
-	str := stringy.New(title)
-	str = stringy.New(str.RemoveSpecialCharacter())
+	//str := stringy.New(title)
+	str := stringy.New(cleanFileName(title))
 	return str.KebabCase().Get() + ext

 }

+// cleanFileName returns original passed through sanitize.Name; it is the single place
+// where this file picks the sanitizer used for folder and file names.
+func cleanFileName(original string) string {
+	return sanitize.Name(original)
+}
+
 func checkError(err error) {
 	if err != nil {
 		panic(err)