[feat] build backlink section by python
All checks were successful
/ deploy (push) Successful in 1m18s
All checks were successful
/ deploy (push) Successful in 1m18s
This commit is contained in:
parent
cb37e6a77b
commit
e2b239d705
2 changed files with 125 additions and 95 deletions
182
soup.py
182
soup.py
|
@ -11,15 +11,13 @@ def re_link(link):
|
||||||
urls = re.findall(r'href=[\'"]?([^\'" >]+)', link)
|
urls = re.findall(r'href=[\'"]?([^\'" >]+)', link)
|
||||||
return urls
|
return urls
|
||||||
|
|
||||||
|
|
||||||
def new_section(soup):
|
def new_section(soup):
|
||||||
new_div = soup.new_tag("div")
|
new_div = soup.new_tag("div")
|
||||||
new_div["class"] = 'bl-section'
|
new_div["class"] = 'bl-section'
|
||||||
new_h4 = soup.new_tag("h4")
|
new_h4 = soup.new_tag("h4")
|
||||||
new_h4.string = "反向链接"
|
new_h4.string = "反向链接"
|
||||||
new_div.append(new_h4)
|
new_div.append(new_h4)
|
||||||
bl_div = soup.new_tag("div")
|
|
||||||
bl_div["class"] = 'backlinks'
|
|
||||||
new_div.append(bl_div)
|
|
||||||
bl_ul = soup.new_tag("ul")
|
bl_ul = soup.new_tag("ul")
|
||||||
new_div.append(bl_ul)
|
new_div.append(bl_ul)
|
||||||
|
|
||||||
|
@ -27,18 +25,21 @@ def new_section(soup):
|
||||||
|
|
||||||
|
|
||||||
def soup_link():
|
def soup_link():
|
||||||
files = glob.glob("public/posts/*/*.html")
|
files = glob.glob("public/main/*/*.html")
|
||||||
|
files += glob.glob("public/daily/*/*.html")
|
||||||
|
files += glob.glob("public/references/*/*.html")
|
||||||
|
files += glob.glob("public/articles/*/*.html")
|
||||||
|
|
||||||
pages_dict = defaultdict(dict)
|
_pages_dict = defaultdict(dict)
|
||||||
for file in files:
|
for file in files:
|
||||||
with open(file, 'r') as f:
|
with open(file, 'r') as f:
|
||||||
soup = BeautifulSoup(f.read(), "lxml")
|
soup = BeautifulSoup(f.read(), "lxml")
|
||||||
|
|
||||||
temp_link = soup.find_all(name='a',class_='internal-link')
|
temp_link = soup.find_all(name='a',class_='internal-link')
|
||||||
page_link = []
|
page_link = set()
|
||||||
for i in temp_link:
|
for i in temp_link:
|
||||||
i = i.get("href")
|
i = i.get("href")
|
||||||
page_link.append(i)
|
page_link.add(i)
|
||||||
|
|
||||||
nodes = soup.find_all(class_ = "NODE")
|
nodes = soup.find_all(class_ = "NODE")
|
||||||
nodes_dict = {}
|
nodes_dict = {}
|
||||||
|
@ -54,7 +55,7 @@ def soup_link():
|
||||||
while node_element.name != iii.name:
|
while node_element.name != iii.name:
|
||||||
node_content += str(iii)
|
node_content += str(iii)
|
||||||
iii = iii.find_next_sibling()
|
iii = iii.find_next_sibling()
|
||||||
if iii == None or (iii.name == 'div' and '🔗反向链接' in iii.text):
|
if iii == None:
|
||||||
break
|
break
|
||||||
|
|
||||||
node_id_link = '/' + file_name + '/#' +node_element["id"]
|
node_id_link = '/' + file_name + '/#' +node_element["id"]
|
||||||
|
@ -68,102 +69,116 @@ def soup_link():
|
||||||
if node_link in page_link:
|
if node_link in page_link:
|
||||||
page_link.remove(node_link)
|
page_link.remove(node_link)
|
||||||
|
|
||||||
pages_dict[file_name]["name"] = soup.title.string
|
_pages_dict[file_name]["name"] = soup.title.string
|
||||||
pages_dict[file_name]["links"] = page_link
|
_pages_dict[file_name]["path"] = file
|
||||||
pages_dict[file_name]["backlinks"] = []
|
_pages_dict[file_name]["links"] = page_link
|
||||||
pages_dict[file_name]["nodes"] = nodes_dict
|
_pages_dict[file_name]["backlinks"] = []
|
||||||
|
_pages_dict[file_name]["nodes"] = nodes_dict
|
||||||
|
|
||||||
return pages_dict
|
return _pages_dict
|
||||||
|
|
||||||
pages_dict = soup_link()
|
|
||||||
backlinks_dict = {}
|
def build_backlinks_dict(pages_dict):
|
||||||
backlinks_dict = pages_dict
|
backlinks_dict = pages_dict
|
||||||
pages_dict = dict(pages_dict)
|
|
||||||
|
|
||||||
for page in pages_dict:
|
for page in pages_dict:
|
||||||
try:
|
for _link in pages_dict[page]["links"]:
|
||||||
for link in pages_dict[page]["links"]:
|
if _link == "":
|
||||||
link = urllib.parse.unquote(link)
|
continue
|
||||||
if '#' in link:
|
|
||||||
link = link.split('/')
|
link = urllib.parse.unquote(_link).split("/")
|
||||||
|
_filter = [
|
||||||
|
i
|
||||||
|
for i in backlinks_dict
|
||||||
|
if backlinks_dict[i]["path"] == f"public/{link[-3]}/{link[-2]}/index.html"
|
||||||
|
]
|
||||||
|
if len(_filter) == 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
post_link = re.findall(r"public(.*)index.html", pages_dict[page]["path"])[0]
|
||||||
|
|
||||||
|
if "#" in link[-1]:
|
||||||
if len(link) > 2:
|
if len(link) > 2:
|
||||||
# ['', 'posts', '卡片笔记写作法', '#要如何在心流状态下写作-p39']
|
# ['', 'posts', '卡片笔记写作法', '#要如何在心流状态下写作-p39']
|
||||||
link_page = link[-2]
|
link_page = link[-2]
|
||||||
link_node = link[-1][1:]
|
link_node = link[-1][1:]
|
||||||
|
|
||||||
backlinks_dict[link_page]["nodes"][link_node]["backlinks"].append(page)
|
backlinks_dict[link_page]["nodes"][link_node]["backlinks"].append(post_link)
|
||||||
|
else:
|
||||||
|
backlinks_dict[_filter[0]]["backlinks"].append(post_link)
|
||||||
|
|
||||||
for node in pages_dict[page]["nodes"]:
|
for node in pages_dict[page]["nodes"]:
|
||||||
for link in pages_dict[page]["nodes"][node]["links"]:
|
for node_link in pages_dict[page]["nodes"][node]["links"]:
|
||||||
link = urllib.parse.unquote(link)
|
node_link = urllib.parse.unquote(node_link)
|
||||||
link = link.split('/')
|
node_link = node_link.split('/')
|
||||||
|
|
||||||
if len(link) == 1:
|
if len(node_link) == 1:
|
||||||
link_node = link[0][1:]
|
link_node = node_link[0][1:]
|
||||||
link = '#' + node
|
node_link = '#' + node
|
||||||
backlinks_dict[page]["nodes"][link_node]["backlinks"].append(link)
|
backlinks_dict[page]["nodes"][link_node]["backlinks"].append(node_link)
|
||||||
elif len(link) > 2:
|
elif len(node_link) > 2:
|
||||||
# ['', 'posts', '卡片笔记写作法', '#卡片笔记中有哪些索引-p9']
|
# ['', 'posts', '卡片笔记写作法', '#卡片笔记中有哪些索引-p9']
|
||||||
if link[-2] == page:
|
if node_link[-2] == page:
|
||||||
link_node = link[-1][1:]
|
link_node = link[-1][1:]
|
||||||
link = link[-1]
|
node_link = node_link[-1]
|
||||||
backlinks_dict[page]["nodes"][link_node]["backlinks"].append(link)
|
backlinks_dict[page]["nodes"][link_node]["backlinks"].append(node_link)
|
||||||
else:
|
else:
|
||||||
if link[-1].startswith('#'):
|
if node_link[-1].startswith('#'):
|
||||||
link_page = link[-2]
|
link_page = node_link[-2]
|
||||||
link_node = link_node = link[-1][1:]
|
link_node = node_link[-1][1:]
|
||||||
backlinks_dict[link_page]["nodes"][link_node]["backlinks"].append(pages_dict[page]["nodes"][node]["id-link"])
|
backlinks_dict[link_page]["nodes"][link_node]["backlinks"].append(pages_dict[page]["nodes"][node]["id-link"])
|
||||||
else:
|
else:
|
||||||
link_page = link[-2]
|
link_page = node_link[-2]
|
||||||
backlinks_dict[link_page]["backlinks"].append(pages_dict[page]["nodes"][node]["id-link"])
|
backlinks_dict[link_page]["backlinks"].append(pages_dict[page]["nodes"][node]["id-link"])
|
||||||
except KeyError as e:
|
|
||||||
print('except:', e)
|
|
||||||
|
|
||||||
del(pages_dict)
|
return backlinks_dict
|
||||||
|
|
||||||
for page in backlinks_dict:
|
|
||||||
write_flag = 0
|
|
||||||
try:
|
|
||||||
with open('public/posts/' + page + '/index.html', 'r+') as f:
|
|
||||||
soup = BeautifulSoup(f.read(), "lxml")
|
|
||||||
except FileNotFoundError as e:
|
|
||||||
print('except:', e)
|
|
||||||
continue
|
|
||||||
|
|
||||||
for i in backlinks_dict[page]["backlinks"]:
|
def write_backlinks(backlinks_dict):
|
||||||
|
def parse_page_backlinks(i):
|
||||||
backlink_filename = i.split('/')[-2]
|
backlink_filename = i.split('/')[-2]
|
||||||
backlink_nodename = i.split('/')[-1][1:]
|
backlink_nodename = i.split('/')[-1][1:]
|
||||||
|
try:
|
||||||
backlink_name = backlinks_dict[backlink_filename]["nodes"][backlink_nodename]["name"].rstrip()
|
backlink_name = backlinks_dict[backlink_filename]["nodes"][backlink_nodename]["name"].rstrip()
|
||||||
i = "/posts" + i
|
except KeyError:
|
||||||
|
_filter = [
|
||||||
|
x
|
||||||
|
for x in backlinks_dict
|
||||||
|
if backlinks_dict[x]["path"] == f"public{i}index.html"
|
||||||
|
][0]
|
||||||
|
backlink_name = backlinks_dict[_filter]["name"]
|
||||||
|
|
||||||
new_li = soup.new_tag("li")
|
new_li = soup.new_tag("li")
|
||||||
new_a = soup.new_tag("a", href=i)
|
new_a = soup.new_tag("a", href=i)
|
||||||
new_a.string = backlink_name
|
new_a.string = backlink_name
|
||||||
new_li.append(new_a)
|
new_li.append(new_a)
|
||||||
|
|
||||||
bl_section = soup.find(class_="bl-section")
|
bl_section = soup.find(class_="bl-section").find(name="ul")
|
||||||
if bl_section != None:
|
bl_section.append(new_li)
|
||||||
bl_section.find_next("ul").append(new_li)
|
|
||||||
write_flag = 1
|
|
||||||
|
|
||||||
for i in backlinks_dict[page]["nodes"]:
|
|
||||||
if backlinks_dict[page]["nodes"][i]["backlinks"] == []:
|
def parse_node_backlink(i):
|
||||||
continue
|
|
||||||
new_bl = new_section(soup)
|
new_bl = new_section(soup)
|
||||||
for ii in backlinks_dict[page]["nodes"][i]["backlinks"]:
|
for ii in backlinks_dict[page]["nodes"][i]["backlinks"]:
|
||||||
link = ii.split('/')
|
link = ii.split("/")
|
||||||
if len(link) == 1:
|
|
||||||
if link[0].startswith('#'):
|
if link[-1].startswith("#"):
|
||||||
backlink_nodename = link[0][1:]
|
backlink_nodename = link[0][1:]
|
||||||
backlink_name = backlinks_dict[page]["nodes"][backlink_nodename]["name"].rstrip()
|
backlink_name = backlinks_dict[page]["nodes"][backlink_nodename][
|
||||||
|
"name"
|
||||||
|
].rstrip()
|
||||||
|
|
||||||
|
# print(backlink_name)
|
||||||
else:
|
else:
|
||||||
backlink_nodename = link[0]
|
backlink_nodename = link[2]
|
||||||
backlink_name = backlinks_dict[backlink_nodename]["name"].rstrip() #FIXME
|
backlink_name = backlinks_dict[backlink_nodename]["name"].rstrip()
|
||||||
ii = '/posts/' + backlink_nodename + '/'
|
# if len(link) > 1:
|
||||||
if len(link) > 1:
|
# backlink_pagename = link[2]
|
||||||
backlink_pagename = link[1]
|
# backlink_nodename = link[3][1:]
|
||||||
backlink_nodename = link[2][1:]
|
# print(link)
|
||||||
backlink_name = backlinks_dict[backlink_pagename]["nodes"][backlink_nodename]["name"].rstrip()
|
# print(page)
|
||||||
|
# backlink_name = backlinks_dict[backlink_pagename]["nodes"][backlink_nodename][
|
||||||
|
# "name"
|
||||||
|
# ].rstrip()
|
||||||
|
|
||||||
new_li = soup.new_tag("li")
|
new_li = soup.new_tag("li")
|
||||||
new_a = soup.new_tag("a", href=ii)
|
new_a = soup.new_tag("a", href=ii)
|
||||||
|
@ -176,9 +191,6 @@ for page in backlinks_dict:
|
||||||
heading = int(bl_section.name[1:])
|
heading = int(bl_section.name[1:])
|
||||||
|
|
||||||
while (True):
|
while (True):
|
||||||
if iii.name == 'div' and '🔗反向链接' in iii.text:
|
|
||||||
iii.find_previous_sibling().append(new_bl)
|
|
||||||
break
|
|
||||||
if iii.find_next_sibling() == None:
|
if iii.find_next_sibling() == None:
|
||||||
iii.append(new_bl)
|
iii.append(new_bl)
|
||||||
break
|
break
|
||||||
|
@ -187,12 +199,30 @@ for page in backlinks_dict:
|
||||||
break
|
break
|
||||||
iii = iii.find_next_sibling()
|
iii = iii.find_next_sibling()
|
||||||
|
|
||||||
write_flag = 1
|
for page in backlinks_dict:
|
||||||
|
try:
|
||||||
|
with open(backlinks_dict[page]["path"], 'r') as f:
|
||||||
|
soup = BeautifulSoup(f.read(), "lxml")
|
||||||
|
except FileNotFoundError as e:
|
||||||
|
print('except:', e)
|
||||||
|
continue
|
||||||
|
|
||||||
if write_flag == 1:
|
content_section = soup.find(class_="content")
|
||||||
with open('public/posts/' + page + '/index.html', 'r+') as f:
|
content_section.insert_after(new_section(soup))
|
||||||
|
|
||||||
|
for i in backlinks_dict[page]["backlinks"]:
|
||||||
|
parse_page_backlinks(i)
|
||||||
|
for i in backlinks_dict[page]["nodes"]:
|
||||||
|
if backlinks_dict[page]["nodes"][i]["backlinks"] == []:
|
||||||
|
continue
|
||||||
|
|
||||||
|
parse_node_backlink(i)
|
||||||
|
|
||||||
|
with open(backlinks_dict[page]["path"], 'r+') as f:
|
||||||
f.write(str(soup))
|
f.write(str(soup))
|
||||||
print("write " + page + '!')
|
print("write " + page + '!')
|
||||||
|
|
||||||
|
|
||||||
|
pages_dict = soup_link()
|
||||||
|
backlinks_dict = build_backlinks_dict(pages_dict)
|
||||||
|
write_backlinks(backlinks_dict)
|
||||||
|
|
|
@ -5,7 +5,7 @@
|
||||||
<div class="content">
|
<div class="content">
|
||||||
<h1>{{ .Title }}</h1>
|
<h1>{{ .Title }}</h1>
|
||||||
{{ .Content }}
|
{{ .Content }}
|
||||||
{{ partial "backlinks.html" . }}
|
<!-- {{ partial "backlinks.html" . }} -->
|
||||||
</div>
|
</div>
|
||||||
{{ partial "comment.html" . }}
|
{{ partial "comment.html" . }}
|
||||||
</div>
|
</div>
|
||||||
|
|
Loading…
Reference in a new issue