#!/usr/bin/env python3
"""Rewrite exported HTML pages under public/posts/ with "反向链接"
(backlink) sections, collected by scanning every page's internal links."""
import glob
import re
import urllib.parse
from collections import defaultdict

from bs4 import BeautifulSoup


def re_link(link):
    """Extract every href target from a fragment of serialized HTML."""
    return re.findall(r'href=[\'"]?([^\'" >]+)', link)
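
# Usage sketch, on a hypothetical fragment:
#   re_link('<a href="/posts/foo/#bar-p1">bar</a>')  ->  ['/posts/foo/#bar-p1']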


def new_section(soup):
    """Build an empty backlinks section ("反向链接" = "backlinks")."""
    new_div = soup.new_tag("div")
    new_div["class"] = 'bl-section'
    new_h4 = soup.new_tag("h4")
    new_h4.string = "反向链接"
    new_div.append(new_h4)
    bl_div = soup.new_tag("div")
    bl_div["class"] = 'backlinks'
    new_div.append(bl_div)
    bl_ul = soup.new_tag("ul")
    new_div.append(bl_ul)
    return new_div
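
# For reference, the section built above serializes as:
#   <div class="bl-section">
#     <h4>反向链接</h4>
#     <div class="backlinks"></div>
#     <ul></ul>
#   </div>
# (the <ul> is a sibling of div.backlinks; the find_next("ul") and
#  find("ul") lookups below rely on that placement)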


def soup_link():
    """Collect each post's outgoing internal links and NODE anchors."""
    files = glob.glob("public/posts/*/*.html")
    pages_dict = defaultdict(dict)
    for file in files:
        with open(file, 'r') as f:
            soup = BeautifulSoup(f.read(), "lxml")
        # Page-level internal links.
        page_link = [a.get("href")
                     for a in soup.find_all(name='a', class_='internal-link')]
        # Nodes are headings tagged with class NODE; record each node's
        # outgoing links and its own anchor ("id-link").
        nodes = soup.find_all(class_="NODE")
        nodes_dict = {}
        file_name = file.split('/')[-2]
        for node in nodes:
            node_dict = {}
            node_element = node.parent.parent
            node_dict["name"] = node_element.contents[0]
            # Gather the node's content: every sibling up to the next
            # heading of the same level or an existing backlinks block.
            sibling = node_element.find_next_sibling()
            node_content = ''
            while sibling is not None and sibling.name != node_element.name:
                node_content += str(sibling)
                sibling = sibling.find_next_sibling()
                if sibling is None or (sibling.name == 'div'
                                       and '🔗反向链接' in sibling.text):
                    break
            node_dict["links"] = re_link(node_content)
            node_dict["id-link"] = '/' + file_name + '/#' + node_element["id"]
            node_dict["backlinks"] = []
            nodes_dict[node_element["id"]] = node_dict
        # A link inside a node should not also count as a page-level link.
        for i in nodes_dict:
            for node_link in nodes_dict[i]["links"]:
                if node_link in page_link:
                    page_link.remove(node_link)
        pages_dict[file_name]["name"] = soup.title.string
        pages_dict[file_name]["links"] = page_link
        pages_dict[file_name]["backlinks"] = []
        pages_dict[file_name]["nodes"] = nodes_dict
    return pages_dict
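
# Shape of the returned mapping (illustrative, names taken from the
# example comments below):
# {
#     '卡片笔记写作法': {
#         'name': <page title>,
#         'links': [...],        # page-level internal hrefs
#         'backlinks': [],       # filled in by the resolution pass below
#         'nodes': {
#             '要如何在心流状态下写作-p39': {
#                 'name': <heading text>,
#                 'links': [...],
#                 'id-link': '/卡片笔记写作法/#要如何在心流状态下写作-p39',
#                 'backlinks': [],
#             },
#         },
#     },
# }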


pages_dict = soup_link()
# backlinks_dict shares the same nested dicts, so the appends below are
# visible through both names; pages_dict becomes a plain dict so missing
# pages raise KeyError (caught below) rather than being auto-created.
backlinks_dict = pages_dict
pages_dict = dict(pages_dict)

# Resolve outgoing links into backlink entries on their targets.
for page in pages_dict:
    try:
        # Page-level links that point at a node anchor on another page.
        for link in pages_dict[page]["links"]:
            link = urllib.parse.unquote(link)
            if '#' in link:
                link = link.split('/')
                if len(link) > 2:
                    # e.g. ['', 'posts', '卡片笔记写作法', '#要如何在心流状态下写作-p39']
                    link_page = link[-2]
                    link_node = link[-1][1:]
                    backlinks_dict[link_page]["nodes"][link_node]["backlinks"].append(page)
        # Links that originate inside a node.
        for node in pages_dict[page]["nodes"]:
            for link in pages_dict[page]["nodes"][node]["links"]:
                link = urllib.parse.unquote(link).split('/')
                if len(link) == 1:
                    # Bare '#anchor': a node on the same page.
                    link_node = link[0][1:]
                    backlinks_dict[page]["nodes"][link_node]["backlinks"].append('#' + node)
                elif len(link) > 2:
                    # e.g. ['', 'posts', '卡片笔记写作法', '#卡片笔记中有哪些索引-p9']
                    if link[-2] == page:
                        # Node link within the same page.
                        link_node = link[-1][1:]
                        backlinks_dict[page]["nodes"][link_node]["backlinks"].append(link[-1])
                    elif link[-1].startswith('#'):
                        # Node on another page: record this node's id-link there.
                        link_page = link[-2]
                        link_node = link[-1][1:]
                        backlinks_dict[link_page]["nodes"][link_node]["backlinks"].append(
                            pages_dict[page]["nodes"][node]["id-link"])
                    else:
                        # Plain link to another page: record on the page itself.
                        link_page = link[-2]
                        backlinks_dict[link_page]["backlinks"].append(
                            pages_dict[page]["nodes"][node]["id-link"])
    except KeyError as e:
        print('except:', e)
del pages_dict
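
# After resolution, a node's "backlinks" list can hold three shapes:
#   'some-page'           - bare page name (page-level source; see FIXME below)
#   '#anchor-id'          - another node on the same page
#   '/page/#anchor-id'    - the id-link of a node on another page
# Page-level "backlinks" hold id-links only.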

# Write the collected backlinks back into each page's index.html.
for page in backlinks_dict:
    write_flag = 0
    try:
        with open('public/posts/' + page + '/index.html', 'r') as f:
            soup = BeautifulSoup(f.read(), "lxml")
    except FileNotFoundError as e:
        print('except:', e)
        continue
    # Page-level backlinks go into the page's existing bl-section.
    for i in backlinks_dict[page]["backlinks"]:
        backlink_filename = i.split('/')[-2]
        backlink_nodename = i.split('/')[-1][1:]
        backlink_name = backlinks_dict[backlink_filename]["nodes"][backlink_nodename]["name"].rstrip()
        new_li = soup.new_tag("li")
        new_a = soup.new_tag("a", href="/posts" + i)
        new_a.string = backlink_name
        new_li.append(new_a)
        bl_section = soup.find(class_="bl-section")
        if bl_section is not None:
            bl_section.find_next("ul").append(new_li)
            write_flag = 1
    # Node-level backlinks each get a freshly built section.
    for i in backlinks_dict[page]["nodes"]:
        if not backlinks_dict[page]["nodes"][i]["backlinks"]:
            continue
        new_bl = new_section(soup)
        for ii in backlinks_dict[page]["nodes"][i]["backlinks"]:
            link = ii.split('/')
            if len(link) == 1:
                if link[0].startswith('#'):
                    # Anchor of another node on this same page.
                    backlink_nodename = link[0][1:]
                    backlink_name = backlinks_dict[page]["nodes"][backlink_nodename]["name"].rstrip()
                else:
                    # Bare page name from a page-level source.
                    backlink_nodename = link[0]
                    backlink_name = backlinks_dict[backlink_nodename]["name"].rstrip()  # FIXME
                    ii = '/posts/' + backlink_nodename + '/'
            if len(link) > 1:
                # An id-link such as '/page/#anchor'.
                backlink_pagename = link[1]
                backlink_nodename = link[2][1:]
                backlink_name = backlinks_dict[backlink_pagename]["nodes"][backlink_nodename]["name"].rstrip()
            new_li = soup.new_tag("li")
            new_a = soup.new_tag("a", href=ii)
            new_a.string = backlink_name
            new_li.append(new_a)
            new_bl.find("ul").append(new_li)  # items belong inside the section's <ul>
        node_heading = soup.find(id=i)
        heading = int(node_heading.name[1:])  # heading level, e.g. 2 for <h2>
        sibling = node_heading.find_next_sibling()
        if sibling is None:
            # Guard: a node heading with no content after it.
            node_heading.insert_after(new_bl)
        else:
            # Walk forward to the end of this node's content: stop at an
            # existing backlinks block, the document's end, or the next
            # heading of the same or higher level.
            while True:
                if sibling.name == 'div' and '🔗反向链接' in sibling.text:
                    sibling.find_previous_sibling().append(new_bl)
                    break
                if sibling.find_next_sibling() is None:
                    sibling.append(new_bl)
                    break
                if sibling.name.startswith('h') and int(sibling.name[1:]) <= heading:
                    sibling.find_previous_sibling().append(new_bl)
                    break
                sibling = sibling.find_next_sibling()
        write_flag = 1
    if write_flag == 1:
        # 'w' truncates the file; 'r+' would leave stale bytes behind if the
        # new document is shorter than the old one.
        with open('public/posts/' + page + '/index.html', 'w') as f:
            f.write(str(soup))
        print("write " + page + '!')