From 9c4411eaa2a5aeace02e5253afd6c8a7cd4434f2 Mon Sep 17 00:00:00 2001 From: SouthFox Date: Wed, 9 Oct 2024 00:57:49 +0800 Subject: [PATCH] [fix] do not get external link for node --- soup.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/soup.py b/soup.py index 0413a94..349b0ee 100644 --- a/soup.py +++ b/soup.py @@ -7,9 +7,19 @@ from bs4 import BeautifulSoup -def re_link(link): - urls = re.findall(r'href=[\'"]?([^\'" >]+)', link) - return urls +def get_node_link(node_contend: str): + node_soup = BeautifulSoup(node_contend, "lxml") + + temp_link = node_soup.find_all(name='a',class_='internal-link') + node_link = set() + for i in temp_link: + i = i.get("href") + + if i == "": + continue + + node_link.add(i) + return node_link def new_section(soup): @@ -59,7 +69,7 @@ def soup_link(): break node_id_link = '/' + file_name + '/#' +node_element["id"] - node_dict["links"] = re_link(node_content) + node_dict["links"] = get_node_link(node_content) node_dict["id-link"] = node_id_link node_dict["backlinks"] = [] nodes_dict[node_element["id"]] = node_dict