garden/soup.py

#!/usr/bin/env python3
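"""Build backlink ("反向链接") sections for the exported HTML garden.

Walks the exported pages under public/, collects internal links between
pages and between UUID-identified heading nodes, inverts that link graph,
and writes a backlinks section into every page and node that is linked to.
"""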
import glob
import re
import urllib.parse
from collections import defaultdict
from bs4 import BeautifulSoup

UUID_PATTERN = "^[0-9a-f]{8}-[0-9a-f]{4}-[0-5][0-9a-f]{3}-[089ab][0-9a-f]{3}-[0-9a-f]{12}$"


def match_uuid(headline_id: str | None) -> bool:
    """Return True if the heading id looks like a UUID."""
    if headline_id is None:
        return False
    return re.match(UUID_PATTERN, headline_id) is not None


def get_node_link(node_content: str):
    """Collect the hrefs of internal links inside a node's HTML fragment."""
    node_soup = BeautifulSoup(node_content, "lxml")
    internal_links = node_soup.find_all(name='a', class_='internal-link')
    node_link = set()
    for link in internal_links:
        href = link.get("href")
        if not href:
            continue
        node_link.add(href)
    return node_link

def new_section(soup):
    """Build an empty backlinks section: a div with a heading and a <ul>."""
    new_div = soup.new_tag("div")
    new_div["class"] = 'bl-section'
    new_h4 = soup.new_tag("h4")
    new_h4.string = "反向链接"  # "Backlinks"
    new_div.append(new_h4)
    bl_ul = soup.new_tag("ul")
    new_div.append(bl_ul)
    return new_div
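
# Serialized, the empty section built above looks roughly like this
# (a sketch for reference; the <li> entries are appended later):
#
#   <div class="bl-section">
#     <h4>反向链接</h4>
#     <ul></ul>
#   </div>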

def soup_link():
    """Scan every exported HTML page and collect its outgoing internal links.

    Returns a dict keyed by page directory name with the page title, file
    path, page-level links, and a nested dict of UUID-identified heading
    nodes, each with its own links and an empty backlinks list.
    """
    files = glob.glob("public/main/*/*.html")
    files += glob.glob("public/daily/*/*.html")
    files += glob.glob("public/references/*/*.html")
    files += glob.glob("public/articles/*/*.html")

    _pages_dict = defaultdict(dict)

    for file in files:
        with open(file, 'r') as f:
            soup = BeautifulSoup(f.read(), "lxml")

        # Page-level internal links.
        internal_links = soup.find_all(name='a', class_='internal-link')
        page_link = set()
        for link in internal_links:
            href = link.get("href")
            if not href:
                continue
            page_link.add(href)

        # Headings whose id is a UUID are treated as nodes.
        _nodes = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
        nodes = [i for i in _nodes if match_uuid(i.get("id"))]

        nodes_dict = {}
        file_name = file.split('/')[-2]
        for node_element in nodes:
            node_dict = {}
            node_dict["name"] = node_element.contents[0]

            # Collect the HTML between this heading and the next heading of
            # the same level; the node's links are extracted from it.
            sibling = node_element.find_next_sibling()
            node_content = ''
            while sibling is not None and sibling.name != node_element.name:
                node_content += str(sibling)
                sibling = sibling.find_next_sibling()

            node_dict["links"] = get_node_link(node_content)
            node_dict["id-link"] = '/' + file_name + '/#' + node_element["id"]
            node_dict["backlinks"] = []
            nodes_dict[node_element["id"]] = node_dict

        # Links already attributed to a node are dropped from the page level.
        for node_id in nodes_dict:
            for node_link in nodes_dict[node_id]["links"]:
                if node_link in page_link:
                    page_link.remove(node_link)

        _pages_dict[file_name]["name"] = soup.title.string
        _pages_dict[file_name]["path"] = file
        _pages_dict[file_name]["links"] = page_link
        _pages_dict[file_name]["backlinks"] = []
        _pages_dict[file_name]["nodes"] = nodes_dict

    return _pages_dict


def build_backlinks_dict(pages_dict):
    """Invert the link graph: for every outgoing link, record a backlink on
    the target page or node."""
    backlinks_dict = pages_dict
    for page in pages_dict:
        # Page-level links point at other pages.
        for _link in pages_dict[page]["links"]:
            if _link == "":
                continue
            link = urllib.parse.unquote(_link).split("/")
            _filter = [
                i
                for i in backlinks_dict
                if backlinks_dict[i]["path"] == f"public/{link[-3]}/{link[-2]}/index.html"
            ]
            if len(_filter) == 0:
                continue
            post_link = re.findall(r"public(.*)index.html", pages_dict[page]["path"])[0]
            backlinks_dict[_filter[0]]["backlinks"].append(post_link)

        # Node-level links can point at a node on the same page, a node on
        # another page, or another page as a whole.
        for node in pages_dict[page]["nodes"]:
            for node_link in pages_dict[page]["nodes"][node]["links"]:
                node_link = urllib.parse.unquote(node_link).split('/')
                if len(node_link) == 1:
                    # Same-page anchor such as '#<uuid>'.
                    link_node = node_link[0][1:]
                    backlinks_dict[page]["nodes"][link_node]["backlinks"].append('#' + node)
                elif len(node_link) > 2:
                    # e.g. ['', 'posts', '卡片笔记写作法', '#卡片笔记中有哪些索引-p9']
                    if node_link[-2] == page:
                        # Link back into the same page, written as a full path.
                        link_node = node_link[-1][1:]
                        backlinks_dict[page]["nodes"][link_node]["backlinks"].append(node_link[-1])
                    elif node_link[-1].startswith('#'):
                        # Link to a node on another page.
                        link_page = node_link[-2]
                        link_node = node_link[-1][1:]
                        backlinks_dict[link_page]["nodes"][link_node]["backlinks"].append(
                            pages_dict[page]["nodes"][node]["id-link"]
                        )
                    else:
                        # Link to another page as a whole.
                        link_page = node_link[-2]
                        backlinks_dict[link_page]["backlinks"].append(
                            pages_dict[page]["nodes"][node]["id-link"]
                        )
    return backlinks_dict


def write_backlinks(backlinks_dict):
    """Insert the collected backlinks into every page and write it back."""

    def parse_page_backlinks(i):
        # `soup` comes from the enclosing per-page loop below.
        backlink_filename = i.split('/')[-2]
        backlink_nodename = i.split('/')[-1][1:]
        try:
            # Backlink from a node: look its name up on the source page.
            backlink_name = backlinks_dict[backlink_filename]["nodes"][backlink_nodename]["name"].rstrip()
        except KeyError:
            # Backlink from a whole page: fall back to the page title.
            _filter = [
                x
                for x in backlinks_dict
                if backlinks_dict[x]["path"] == f"public{i}index.html"
            ][0]
            backlink_name = backlinks_dict[_filter]["name"]

        new_li = soup.new_tag("li")
        new_a = soup.new_tag("a", href=i)
        new_a.string = backlink_name
        new_li.append(new_a)
        bl_section = soup.find(class_="bl-section").find(name="ul")
        bl_section.append(new_li)

    def parse_node_backlink(i):
        # `soup` and `page` come from the enclosing per-page loop below.
        new_bl = new_section(soup)
        for ii in backlinks_dict[page]["nodes"][i]["backlinks"]:
            link = ii.split("/")
            if link[-1].startswith("#"):
                # '#<uuid>' (same page) or '/<page>/#<uuid>' (another page).
                backlink_nodename = link[-1][1:]
                backlink_pagename = link[-2] if len(link) > 1 else page
                backlink_name = backlinks_dict[backlink_pagename]["nodes"][
                    backlink_nodename
                ]["name"].rstrip()
            else:
                # '/<section>/<page>/' style link to a whole page.
                backlink_nodename = link[2]
                backlink_name = backlinks_dict[backlink_nodename]["name"].rstrip()

            new_li = soup.new_tag("li")
            new_a = soup.new_tag("a", href=ii)
            new_a.string = backlink_name
            new_li.append(new_a)
            # Append into the section's <ul>, matching the page-level section.
            new_bl.find(name="ul").append(new_li)

        # Attach the section at the end of this node's own content, i.e. just
        # before the next heading of the same or higher level.
        bl_section = soup.find(id=i)
        sibling = bl_section.find_next_sibling()
        heading = int(bl_section.name[1:])
        while True:
            if sibling.find_next_sibling() is None:
                sibling.append(new_bl)
                break
            if sibling.name.startswith('h') and int(sibling.name[1:]) <= heading:
                sibling.find_previous_sibling().append(new_bl)
                break
            sibling = sibling.find_next_sibling()

    for page in backlinks_dict:
        try:
            with open(backlinks_dict[page]["path"], 'r') as f:
                soup = BeautifulSoup(f.read(), "lxml")
        except FileNotFoundError as e:
            print('except:', e)
            continue

        # Page-level backlinks section, inserted right after the content.
        content_section = soup.find(class_="content")
        content_section.insert_after(new_section(soup))
        for i in backlinks_dict[page]["backlinks"]:
            parse_page_backlinks(i)

        # Per-node backlinks sections.
        for i in backlinks_dict[page]["nodes"]:
            if backlinks_dict[page]["nodes"][i]["backlinks"] == []:
                continue
            parse_node_backlink(i)

        with open(backlinks_dict[page]["path"], "w", encoding="utf-8") as f:
            f.write(str(soup))
        print("write " + page + '!')


if __name__ == "__main__":
    pages_dict = soup_link()
    backlinks_dict = build_backlinks_dict(pages_dict)
    write_backlinks(backlinks_dict)