sanitizer: add support for HTML hidden attribute

This commit adjusts the `Sanitize` function to skip elements that carry the `hidden` attribute, together with their contents, in the same way it already skips blocked tags and their contents.
2024-06-19 20:38:24 +02:00 · 2024-06-19 20:38:24 +02:00 · ee5e18ea9f
commit ee5e18ea9f
parent 3ef2522c62
2 changed files with 20 additions and 9 deletions
--- a/internal/reader/sanitizer/sanitizer.go
+++ b/internal/reader/sanitizer/sanitizer.go
@@ -82,7 +82,7 @@ func Sanitize(baseURL, input string) string {
 	var buffer strings.Builder
 	var tagStack []string
 	var parentTag string
-	blacklistedTagDepth := 0
+	var blockedStack []string
 	tokenizer := html.NewTokenizer(strings.NewReader(input))
 	for {
@@ -98,7 +98,7 @@ func Sanitize(baseURL, input string) string {
 		token := tokenizer.Token()
 		switch token.Type {
 		case html.TextToken:
-			if blacklistedTagDepth > 0 {
+			if len(blockedStack) > 0 {
 				continue
 			}
@@ -116,7 +116,10 @@ func Sanitize(baseURL, input string) string {
 			if isPixelTracker(tagName, token.Attr) {
 				continue
 			}
-			if isValidTag(tagName) {
+
+			if isBlockedTag(tagName) || slices.ContainsFunc(token.Attr, func(attr html.Attribute) bool { return attr.Key == "hidden" }) {
+				blockedStack = append(blockedStack, tagName)
+			} else if len(blockedStack) == 0 && isValidTag(tagName) {
 				attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
 				if hasRequiredAttributes(tagName, attrNames) {
@@ -128,22 +131,20 @@ func Sanitize(baseURL, input string) string {
 					tagStack = append(tagStack, tagName)
 				}
-			} else if isBlockedTag(tagName) {
-				blacklistedTagDepth++
 			}
 		case html.EndTagToken:
 			tagName := token.DataAtom.String()
-			if isValidTag(tagName) && slices.Contains(tagStack, tagName) {
+			if len(blockedStack) > 0 && blockedStack[len(blockedStack)-1] == tagName {
+				blockedStack = blockedStack[:len(blockedStack)-1]
+			} else if len(blockedStack) == 0 && isValidTag(tagName) && slices.Contains(tagStack, tagName) {
 				buffer.WriteString("</" + tagName + ">")
-			} else if isBlockedTag(tagName) {
-				blacklistedTagDepth--
 			}
 		case html.SelfClosingTagToken:
 			tagName := token.DataAtom.String()
 			if isPixelTracker(tagName, token.Attr) {
 				continue
 			}
-			if isValidTag(tagName) {
+			if isValidTag(tagName) && len(blockedStack) == 0 {
 				attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
 				if hasRequiredAttributes(tagName, attrNames) {
 					if len(attrNames) > 0 {
--- a/internal/reader/sanitizer/sanitizer_test.go
+++ b/internal/reader/sanitizer/sanitizer_test.go
@@ -630,3 +630,13 @@ func TestReplaceStyle(t *testing.T) {
 		t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
 	}
 }
+func TestHiddenParagraph(t *testing.T) {
+	input := `<p>Before paragraph.</p><p hidden>This should <em>not</em> appear in the <strong>output</strong></p><p>After paragraph.</p>`
+	expected := `<p>Before paragraph.</p><p>After paragraph.</p>`
+	output := Sanitize("http://example.org/", input)
+	if expected != output {
+		t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+	}
+}