sanitizer: add support for HTML hidden attribute

This commit adjusts the `Sanitize` function to drop elements that carry
the `hidden` attribute, together with everything nested inside them, the
same way it already drops blocked tags and their contents.
This commit is contained in:
JohnnyJayJay 2024-06-19 20:38:24 +02:00 committed by Frédéric Guillot
parent 3ef2522c62
commit ee5e18ea9f
2 changed files with 20 additions and 9 deletions

View file

@ -82,7 +82,7 @@ func Sanitize(baseURL, input string) string {
var buffer strings.Builder var buffer strings.Builder
var tagStack []string var tagStack []string
var parentTag string var parentTag string
blacklistedTagDepth := 0 var blockedStack []string
tokenizer := html.NewTokenizer(strings.NewReader(input)) tokenizer := html.NewTokenizer(strings.NewReader(input))
for { for {
@ -98,7 +98,7 @@ func Sanitize(baseURL, input string) string {
token := tokenizer.Token() token := tokenizer.Token()
switch token.Type { switch token.Type {
case html.TextToken: case html.TextToken:
if blacklistedTagDepth > 0 { if len(blockedStack) > 0 {
continue continue
} }
@ -116,7 +116,10 @@ func Sanitize(baseURL, input string) string {
if isPixelTracker(tagName, token.Attr) { if isPixelTracker(tagName, token.Attr) {
continue continue
} }
if isValidTag(tagName) {
if isBlockedTag(tagName) || slices.ContainsFunc(token.Attr, func(attr html.Attribute) bool { return attr.Key == "hidden" }) {
blockedStack = append(blockedStack, tagName)
} else if len(blockedStack) == 0 && isValidTag(tagName) {
attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr) attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
if hasRequiredAttributes(tagName, attrNames) { if hasRequiredAttributes(tagName, attrNames) {
@ -128,22 +131,20 @@ func Sanitize(baseURL, input string) string {
tagStack = append(tagStack, tagName) tagStack = append(tagStack, tagName)
} }
} else if isBlockedTag(tagName) {
blacklistedTagDepth++
} }
case html.EndTagToken: case html.EndTagToken:
tagName := token.DataAtom.String() tagName := token.DataAtom.String()
if isValidTag(tagName) && slices.Contains(tagStack, tagName) { if len(blockedStack) > 0 && blockedStack[len(blockedStack)-1] == tagName {
blockedStack = blockedStack[:len(blockedStack)-1]
} else if len(blockedStack) == 0 && isValidTag(tagName) && slices.Contains(tagStack, tagName) {
buffer.WriteString("</" + tagName + ">") buffer.WriteString("</" + tagName + ">")
} else if isBlockedTag(tagName) {
blacklistedTagDepth--
} }
case html.SelfClosingTagToken: case html.SelfClosingTagToken:
tagName := token.DataAtom.String() tagName := token.DataAtom.String()
if isPixelTracker(tagName, token.Attr) { if isPixelTracker(tagName, token.Attr) {
continue continue
} }
if isValidTag(tagName) { if isValidTag(tagName) && len(blockedStack) == 0 {
attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr) attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
if hasRequiredAttributes(tagName, attrNames) { if hasRequiredAttributes(tagName, attrNames) {
if len(attrNames) > 0 { if len(attrNames) > 0 {

View file

@ -630,3 +630,13 @@ func TestReplaceStyle(t *testing.T) {
t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
} }
} }
// TestHiddenParagraph checks that a paragraph carrying the hidden attribute
// is removed from the sanitized output together with its nested markup and
// text, while the paragraphs before and after it are kept.
func TestHiddenParagraph(t *testing.T) {
	const (
		input    = `<p>Before paragraph.</p><p hidden>This should <em>not</em> appear in the <strong>output</strong></p><p>After paragraph.</p>`
		expected = `<p>Before paragraph.</p><p>After paragraph.</p>`
	)

	if output := Sanitize("http://example.org/", input); output != expected {
		t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
	}
}