From d847b10e32270c3cf7bafe6e2914e82bda39a924 Mon Sep 17 00:00:00 2001 From: Dave Z Date: Sat, 23 Jun 2018 20:50:43 -0400 Subject: [PATCH] Improve sanitizer to remove script and noscript contents These tags were removed but the content was rendered as escaped HTML. See #157 --- reader/sanitizer/sanitizer.go | 13 +++++++++++++ reader/sanitizer/sanitizer_test.go | 20 ++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/reader/sanitizer/sanitizer.go b/reader/sanitizer/sanitizer.go index 28539118..d7a4626a 100644 --- a/reader/sanitizer/sanitizer.go +++ b/reader/sanitizer/sanitizer.go @@ -25,6 +25,7 @@ func Sanitize(baseURL, input string) string { tokenizer := html.NewTokenizer(bytes.NewBufferString(input)) var buffer bytes.Buffer var tagStack []string + scriptTagDepth := 0 for { if tokenizer.Next() == html.ErrorToken { @@ -39,6 +40,10 @@ func Sanitize(baseURL, input string) string { token := tokenizer.Token() switch token.Type { case html.TextToken: + if scriptTagDepth > 0 { + continue + } + buffer.WriteString(html.EscapeString(token.Data)) case html.StartTagToken: tagName := token.DataAtom.String() @@ -55,11 +60,15 @@ func Sanitize(baseURL, input string) string { tagStack = append(tagStack, tagName) } + } else if isScriptTag(tagName) { + scriptTagDepth++ } case html.EndTagToken: tagName := token.DataAtom.String() if isValidTag(tagName) && inList(tagName, tagStack) { buffer.WriteString(fmt.Sprintf("", tagName)) + } else if isScriptTag(tagName) { + scriptTagDepth-- } case html.SelfClosingTagToken: tagName := token.DataAtom.String() @@ -384,3 +393,7 @@ func rewriteIframeURL(link string) string { return link } + +func isScriptTag(tagName string) bool { + return tagName == "script" || tagName == "noscript" +} diff --git a/reader/sanitizer/sanitizer_test.go b/reader/sanitizer/sanitizer_test.go index 6eb9b0d4..fa7dd6d9 100644 --- a/reader/sanitizer/sanitizer_test.go +++ b/reader/sanitizer/sanitizer_test.go @@ -212,3 +212,23 @@ func TestReplaceIframeURL(t 
*testing.T) { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } + +func TestReplaceNoScript(t *testing.T) { + input := `

Before paragraph.

After paragraph.

` + expected := `

Before paragraph.

After paragraph.

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +} + +func TestReplaceScript(t *testing.T) { + input := `

Before paragraph.

After paragraph.

` + expected := `

Before paragraph.

After paragraph.

` + output := Sanitize("http://example.org/", input) + + if expected != output { + t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) + } +}