gnu-maintenance: ‘generic-html’ update honors <base href="…">.

This fixes updates of ‘curl’: <https://curl.se/download/> includes
<base href="…"> in its head and ignoring it would lead to incorrect
download URLs.

* guix/gnu-maintenance.scm (html-links): Keep track of <base href="…">
in ‘loop’.  Rewrite relative links at the end.

Change-Id: I989da78df3431034c9a584f8e10cad87ae6dc920
This commit is contained in:
Ludovic Courtès 2024-11-28 23:20:00 +01:00
parent db59509931
commit 5d6691d33e
No known key found for this signature in database
GPG key ID: 090B11993D9AEBB5

View file

@ -39,6 +39,7 @@ (define-module (guix gnu-maintenance)
#:use-module (guix utils)
#:use-module (guix diagnostics)
#:use-module (guix i18n)
#:autoload (guix combinators) (fold2)
#:use-module (guix memoization)
#:use-module (guix records)
#:use-module (guix upstream)
@ -483,19 +484,33 @@ (define* (import-release* package #:key (version #f))
(define (html-links sxml)
"Return the list of links found in SXML, the SXML tree of an HTML page."
(let loop ((sxml sxml)
(links '()))
(match sxml
(('a ('@ attributes ...) body ...)
(match (assq 'href attributes)
(#f (fold loop links body))
(('href url) (fold loop (cons url links) body))))
((tag ('@ _ ...) body ...)
(fold loop links body))
((tag body ...)
(fold loop links body))
(_
links))))
(define-values (links base)
(let loop ((sxml sxml)
(links '())
(base #f))
(match sxml
(('a ('@ attributes ...) body ...)
(match (assq 'href attributes)
(#f (fold2 loop links base body))
(('href url) (fold2 loop (cons url links) base body))))
(('base ('@ ('href new-base)))
;; The base against which relative URL paths must be resolved.
(values links new-base))
((tag ('@ _ ...) body ...)
(fold2 loop links base body))
((tag body ...)
(fold2 loop links base body))
(_
(values links base)))))
(if base
(map (lambda (link)
(let ((uri (string->uri link)))
(if (or uri (string-prefix? "/" link))
link
(in-vicinity base link))))
links)
links))
(define (url->links url)
"Return the unique links on the HTML page accessible at URL."