[feat] build backlink section by python

SouthFox 2024-06-04 22:29:50 +08:00
parent cb37e6a77b
commit e2b239d705
2 changed files with 125 additions and 95 deletions

soup.py

@@ -11,15 +11,13 @@ def re_link(link):
     urls = re.findall(r'href=[\'"]?([^\'" >]+)', link)
     return urls
 
 def new_section(soup):
     new_div = soup.new_tag("div")
     new_div["class"] = 'bl-section'
     new_h4 = soup.new_tag("h4")
     new_h4.string = "反向链接"
     new_div.append(new_h4)
-    bl_div = soup.new_tag("div")
-    bl_div["class"] = 'backlinks'
-    new_div.append(bl_div)
     bl_ul = soup.new_tag("ul")
     new_div.append(bl_ul)
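
Note: for reference, this is the markup new_section() produces after this change, as a minimal sketch run against an empty document (the old wrapping div.backlinks is gone):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("", "lxml")

    # the same calls new_section(soup) makes after this commit
    new_div = soup.new_tag("div")
    new_div["class"] = 'bl-section'
    new_h4 = soup.new_tag("h4")
    new_h4.string = "反向链接"
    new_div.append(new_h4)
    bl_ul = soup.new_tag("ul")
    new_div.append(bl_ul)

    print(new_div)
    # <div class="bl-section"><h4>反向链接</h4><ul></ul></div>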
@@ -27,18 +25,21 @@ def new_section(soup):
 
 def soup_link():
-    files = glob.glob("public/posts/*/*.html")
-    pages_dict = defaultdict(dict)
+    files = glob.glob("public/main/*/*.html")
+    files += glob.glob("public/daily/*/*.html")
+    files += glob.glob("public/references/*/*.html")
+    files += glob.glob("public/articles/*/*.html")
+    _pages_dict = defaultdict(dict)
     for file in files:
         with open(file, 'r') as f:
             soup = BeautifulSoup(f.read(), "lxml")
         temp_link = soup.find_all(name='a',class_='internal-link')
-        page_link = []
+        page_link = set()
         for i in temp_link:
             i = i.get("href")
-            page_link.append(i)
+            page_link.add(i)
 
         nodes = soup.find_all(class_ = "NODE")
         nodes_dict = {}
@@ -54,7 +55,7 @@ def soup_link():
            while node_element.name != iii.name:
                node_content += str(iii)
                iii = iii.find_next_sibling()
-                if iii == None or (iii.name == 'div' and '🔗反向链接' in iii.text):
+                if iii == None:
                    break
 
            node_id_link = '/' + file_name + '/#' +node_element["id"]
@@ -68,102 +69,116 @@ def soup_link():
             if node_link in page_link:
                 page_link.remove(node_link)
 
-        pages_dict[file_name]["name"] = soup.title.string
-        pages_dict[file_name]["links"] = page_link
-        pages_dict[file_name]["backlinks"] = []
-        pages_dict[file_name]["nodes"] = nodes_dict
-    return pages_dict
+        _pages_dict[file_name]["name"] = soup.title.string
+        _pages_dict[file_name]["path"] = file
+        _pages_dict[file_name]["links"] = page_link
+        _pages_dict[file_name]["backlinks"] = []
+        _pages_dict[file_name]["nodes"] = nodes_dict
+    return _pages_dict
 
-pages_dict = soup_link()
-backlinks_dict = {}
+
+def build_backlinks_dict(pages_dict):
     backlinks_dict = pages_dict
-pages_dict = dict(pages_dict)
     for page in pages_dict:
-    try:
-        for link in pages_dict[page]["links"]:
-            link = urllib.parse.unquote(link)
-            if '#' in link:
-                link = link.split('/')
+        for _link in pages_dict[page]["links"]:
+            if _link == "":
+                continue
+
+            link = urllib.parse.unquote(_link).split("/")
+            _filter = [
+                i
+                for i in backlinks_dict
+                if backlinks_dict[i]["path"] == f"public/{link[-3]}/{link[-2]}/index.html"
+            ]
+            if len(_filter) == 0:
+                continue
+
+            post_link = re.findall(r"public(.*)index.html", pages_dict[page]["path"])[0]
+            if "#" in link[-1]:
                 if len(link) > 2:
                     # ['', 'posts', '卡片笔记写作法', '#要如何在心流状态下写作-p39']
                     link_page = link[-2]
                     link_node = link[-1][1:]
-                    backlinks_dict[link_page]["nodes"][link_node]["backlinks"].append(page)
+                    backlinks_dict[link_page]["nodes"][link_node]["backlinks"].append(post_link)
+            else:
+                backlinks_dict[_filter[0]]["backlinks"].append(post_link)
 
         for node in pages_dict[page]["nodes"]:
-            for link in pages_dict[page]["nodes"][node]["links"]:
-                link = urllib.parse.unquote(link)
-                link = link.split('/')
-                if len(link) == 1:
-                    link_node = link[0][1:]
-                    link = '#' + node
-                    backlinks_dict[page]["nodes"][link_node]["backlinks"].append(link)
-                elif len(link) > 2:
+            for node_link in pages_dict[page]["nodes"][node]["links"]:
+                node_link = urllib.parse.unquote(node_link)
+                node_link = node_link.split('/')
+                if len(node_link) == 1:
+                    link_node = node_link[0][1:]
+                    node_link = '#' + node
+                    backlinks_dict[page]["nodes"][link_node]["backlinks"].append(node_link)
+                elif len(node_link) > 2:
                     # ['', 'posts', '卡片笔记写作法', '#卡片笔记中有哪些索引-p9']
-                    if link[-2] == page:
+                    if node_link[-2] == page:
                         link_node = link[-1][1:]
-                        link = link[-1]
-                        backlinks_dict[page]["nodes"][link_node]["backlinks"].append(link)
+                        node_link = node_link[-1]
+                        backlinks_dict[page]["nodes"][link_node]["backlinks"].append(node_link)
                     else:
-                        if link[-1].startswith('#'):
-                            link_page = link[-2]
-                            link_node = link_node = link[-1][1:]
+                        if node_link[-1].startswith('#'):
+                            link_page = node_link[-2]
+                            link_node = node_link[-1][1:]
                             backlinks_dict[link_page]["nodes"][link_node]["backlinks"].append(pages_dict[page]["nodes"][node]["id-link"])
                         else:
-                            link_page = link[-2]
+                            link_page = node_link[-2]
                             backlinks_dict[link_page]["backlinks"].append(pages_dict[page]["nodes"][node]["id-link"])
-    except KeyError as e:
-        print('except:', e)
-del(pages_dict)
+    return backlinks_dict
 
-for page in backlinks_dict:
-    write_flag = 0
-    try:
-        with open('public/posts/' + page + '/index.html', 'r+') as f:
-            soup = BeautifulSoup(f.read(), "lxml")
-    except FileNotFoundError as e:
-        print('except:', e)
-        continue
 
-    for i in backlinks_dict[page]["backlinks"]:
+def write_backlinks(backlinks_dict):
+    def parse_page_backlinks(i):
         backlink_filename = i.split('/')[-2]
         backlink_nodename = i.split('/')[-1][1:]
+        try:
             backlink_name = backlinks_dict[backlink_filename]["nodes"][backlink_nodename]["name"].rstrip()
-        i = "/posts" + i
+        except KeyError:
+            _filter = [
+                x
+                for x in backlinks_dict
+                if backlinks_dict[x]["path"] == f"public{i}index.html"
+            ][0]
+            backlink_name = backlinks_dict[_filter]["name"]
 
         new_li = soup.new_tag("li")
         new_a = soup.new_tag("a", href=i)
         new_a.string = backlink_name
         new_li.append(new_a)
-        bl_section = soup.find(class_="bl-section")
-        if bl_section != None:
-            bl_section.find_next("ul").append(new_li)
-            write_flag = 1
+        bl_section = soup.find(class_="bl-section").find(name="ul")
+        bl_section.append(new_li)
 
-    for i in backlinks_dict[page]["nodes"]:
-        if backlinks_dict[page]["nodes"][i]["backlinks"] == []:
-            continue
+    def parse_node_backlink(i):
         new_bl = new_section(soup)
         for ii in backlinks_dict[page]["nodes"][i]["backlinks"]:
-            link = ii.split('/')
-            if len(link) == 1:
-                if link[0].startswith('#'):
+            link = ii.split("/")
+            if link[-1].startswith("#"):
                 backlink_nodename = link[0][1:]
-                    backlink_name = backlinks_dict[page]["nodes"][backlink_nodename]["name"].rstrip()
+                backlink_name = backlinks_dict[page]["nodes"][backlink_nodename][
+                    "name"
+                ].rstrip()
+                # print(backlink_name)
             else:
-                    backlink_nodename = link[0]
-                    backlink_name = backlinks_dict[backlink_nodename]["name"].rstrip()
-                    ii = '/posts/' + backlink_nodename + '/'
-            if len(link) > 1:
-                backlink_pagename = link[1]
-                backlink_nodename = link[2][1:]
-                backlink_name = backlinks_dict[backlink_pagename]["nodes"][backlink_nodename]["name"].rstrip()
+                backlink_nodename = link[2]
+                #FIXME backlink_name = backlinks_dict[backlink_nodename]["name"].rstrip()
+                # if len(link) > 1:
+                # backlink_pagename = link[2]
+                # backlink_nodename = link[3][1:]
+                # print(link)
+                # print(page)
+                # backlink_name = backlinks_dict[backlink_pagename]["nodes"][backlink_nodename][
+                # "name"
+                # ].rstrip()
 
             new_li = soup.new_tag("li")
             new_a = soup.new_tag("a", href=ii)
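
Note: the page-level resolution in build_backlinks_dict() now matches a link back to a generated file path instead of assuming a posts/ prefix. Roughly, with a hypothetical link value:

    import urllib.parse

    _link = "/references/%E5%8D%A1%E7%89%87%E7%AC%94%E8%AE%B0/"  # hypothetical outgoing internal link
    link = urllib.parse.unquote(_link).split("/")
    # ['', 'references', '卡片笔记', '']
    candidate = f"public/{link[-3]}/{link[-2]}/index.html"
    # 'public/references/卡片笔记/index.html', compared against each page's "path" entry
    print(candidate)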
@@ -176,9 +191,6 @@ for page in backlinks_dict:
         heading = int(bl_section.name[1:])
         while (True):
-            if iii.name == 'div' and '🔗反向链接' in iii.text:
-                iii.find_previous_sibling().append(new_bl)
-                break
             if iii.find_next_sibling() == None:
                 iii.append(new_bl)
                 break
@@ -187,12 +199,30 @@ for page in backlinks_dict:
                 break
             iii = iii.find_next_sibling()
 
-        write_flag = 1
-
-    if write_flag == 1:
-        with open('public/posts/' + page + '/index.html', 'r+') as f:
+    for page in backlinks_dict:
+        try:
+            with open(backlinks_dict[page]["path"], 'r') as f:
+                soup = BeautifulSoup(f.read(), "lxml")
+        except FileNotFoundError as e:
+            print('except:', e)
+            continue
+
+        content_section = soup.find(class_="content")
+        content_section.insert_after(new_section(soup))
+
+        for i in backlinks_dict[page]["backlinks"]:
+            parse_page_backlinks(i)
+
+        for i in backlinks_dict[page]["nodes"]:
+            if backlinks_dict[page]["nodes"][i]["backlinks"] == []:
+                continue
+            parse_node_backlink(i)
+
+        with open(backlinks_dict[page]["path"], 'r+') as f:
             f.write(str(soup))
             print("write " + page + '!')
+
+
+pages_dict = soup_link()
+backlinks_dict = build_backlinks_dict(pages_dict)
+write_backlinks(backlinks_dict)
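
Note: the script now runs in three stages: soup_link() scans the generated HTML, build_backlinks_dict() resolves links into backlink lists, and write_backlinks() writes the sections back into the files. A rough sketch of the per-page record soup_link() builds (field names are taken from the diff, example values are made up):

    pages_dict = {
        "卡片笔记写作法": {
            "name": "卡片笔记写作法",                         # page <title>
            "path": "public/main/卡片笔记写作法/index.html",   # file the page was read from
            "links": {"/references/某篇笔记/"},               # outgoing internal links, now a set
            "backlinks": [],                                  # filled in by build_backlinks_dict()
            "nodes": {                                        # per-NODE entries, keyed by element id
                "要如何在心流状态下写作-p39": {
                    "name": "要如何在心流状态下写作",
                    "id-link": "/卡片笔记写作法/#要如何在心流状态下写作-p39",
                    "links": [],
                    "backlinks": [],
                },
            },
        },
    }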

Second changed file (Hugo template):

@@ -5,7 +5,7 @@
 <div class="content">
     <h1>{{ .Title }}</h1>
     {{ .Content }}
-    {{ partial "backlinks.html" . }}
+    <!-- {{ partial "backlinks.html" . }} -->
 </div>
 {{ partial "comment.html" . }}
 </div>
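
Note: the Hugo partial is commented out because the section is no longer rendered at build time; write_backlinks() inserts it into the generated page right after the content div instead. A self-contained sketch of that insertion (the HTML string here is a stand-in for a rendered page):

    from bs4 import BeautifulSoup

    html = '<div class="wrapper"><div class="content"><h1>Title</h1><p>Body</p></div></div>'
    soup = BeautifulSoup(html, "lxml")

    # stand-in for new_section(soup) from soup.py
    section = soup.new_tag("div")
    section["class"] = "bl-section"

    soup.find(class_="content").insert_after(section)
    print(soup.find(class_="content").parent)
    # the bl-section div now sits immediately after div.content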