From e2b239d7059fdbf90cc49ce8e8ab75bf1cd70793 Mon Sep 17 00:00:00 2001
From: SouthFox
Date: Tue, 4 Jun 2024 22:29:50 +0800
Subject: [PATCH] [feat] build backlink section by python

---
 soup.py                                    | 218 ++++++++++++---------
 themes/cortex/layouts/_default/single.html |   2 +-
 2 files changed, 125 insertions(+), 95 deletions(-)

diff --git a/soup.py b/soup.py
index 3387319..05bc68a 100644
--- a/soup.py
+++ b/soup.py
@@ -11,15 +11,13 @@ def re_link(link):
     urls = re.findall(r'href=[\'"]?([^\'" >]+)', link)
     return urls
 
+
 def new_section(soup):
     new_div = soup.new_tag("div")
     new_div["class"] = 'bl-section'
     new_h4 = soup.new_tag("h4")
     new_h4.string = "反向链接"
     new_div.append(new_h4)
-    bl_div = soup.new_tag("div")
-    bl_div["class"] = 'backlinks'
-    new_div.append(bl_div)
     bl_ul = soup.new_tag("ul")
     new_div.append(bl_ul)
 
@@ -27,18 +25,21 @@ def new_section(soup):
 
 
 def soup_link():
-    files = glob.glob("public/posts/*/*.html")
+    files = glob.glob("public/main/*/*.html")
+    files += glob.glob("public/daily/*/*.html")
+    files += glob.glob("public/references/*/*.html")
+    files += glob.glob("public/articles/*/*.html")
 
-    pages_dict = defaultdict(dict)
+    _pages_dict = defaultdict(dict)
     for file in files:
         with open(file, 'r') as f:
             soup = BeautifulSoup(f.read(), "lxml")
 
         temp_link = soup.find_all(name='a',class_='internal-link')
-        page_link = []
+        page_link = set()
        for i in temp_link:
             i = i.get("href")
-            page_link.append(i)
+            page_link.add(i)
 
         nodes = soup.find_all(class_ = "NODE")
         nodes_dict = {}
@@ -54,7 +55,7 @@ def soup_link():
             while node_element.name != iii.name:
                 node_content += str(iii)
                 iii = iii.find_next_sibling()
-                if iii == None or (iii.name == 'div' and '🔗反向链接' in iii.text):
+                if iii == None:
                     break
 
             node_id_link = '/' + file_name + '/#' +node_element["id"]
@@ -68,131 +69,160 @@ def soup_link():
                 if node_link in page_link:
                     page_link.remove(node_link)
 
-        pages_dict[file_name]["name"] = soup.title.string
-        pages_dict[file_name]["links"] = page_link
-        pages_dict[file_name]["backlinks"] = []
-        pages_dict[file_name]["nodes"] = nodes_dict
-
-    return pages_dict
+        _pages_dict[file_name]["name"] = soup.title.string
+        _pages_dict[file_name]["path"] = file
+        _pages_dict[file_name]["links"] = page_link
+        _pages_dict[file_name]["backlinks"] = []
+        _pages_dict[file_name]["nodes"] = nodes_dict
 
-pages_dict = soup_link()
-backlinks_dict = {}
-backlinks_dict = pages_dict
-pages_dict = dict(pages_dict)
+    return _pages_dict
 
-for page in pages_dict:
-    try:
-        for link in pages_dict[page]["links"]:
-            link = urllib.parse.unquote(link)
-            if '#' in link:
-                link = link.split('/')
+
+def build_backlinks_dict(pages_dict):
+    backlinks_dict = pages_dict
+    for page in pages_dict:
+        for _link in pages_dict[page]["links"]:
+            if _link == "":
+                continue
+
+            link = urllib.parse.unquote(_link).split("/")
+            _filter = [
+                i
+                for i in backlinks_dict
+                if backlinks_dict[i]["path"] == f"public/{link[-3]}/{link[-2]}/index.html"
+            ]
+            if len(_filter) == 0:
+                continue
+
+            post_link = re.findall(r"public(.*)index.html", pages_dict[page]["path"])[0]
+
+            if "#" in link[-1]:
                 if len(link) > 2:
                     # ['', 'posts', '卡片笔记写作法', '#要如何在心流状态下写作-p39']
                     link_page = link[-2]
                     link_node = link[-1][1:]
-                    backlinks_dict[link_page]["nodes"][link_node]["backlinks"].append(page)
+                    backlinks_dict[link_page]["nodes"][link_node]["backlinks"].append(post_link)
+            else:
+                backlinks_dict[_filter[0]]["backlinks"].append(post_link)
 
         for node in pages_dict[page]["nodes"]:
-            for link in pages_dict[page]["nodes"][node]["links"]:
-                link = urllib.parse.unquote(link)
-                link = link.split('/')
+            for node_link in pages_dict[page]["nodes"][node]["links"]:
+                node_link = urllib.parse.unquote(node_link)
+                node_link = node_link.split('/')
 
-                if len(link) == 1:
-                    link_node = link[0][1:]
-                    link = '#' + node
-                    backlinks_dict[page]["nodes"][link_node]["backlinks"].append(link)
-                elif len(link) > 2:
+                if len(node_link) == 1:
+                    link_node = node_link[0][1:]
+                    node_link = '#' + node
+                    backlinks_dict[page]["nodes"][link_node]["backlinks"].append(node_link)
+                elif len(node_link) > 2:
                     # ['', 'posts', '卡片笔记写作法', '#卡片笔记中有哪些索引-p9']
-                    if link[-2] == page:
+                    if node_link[-2] == page:
                         link_node = link[-1][1:]
-                        link = link[-1]
-                        backlinks_dict[page]["nodes"][link_node]["backlinks"].append(link)
+                        node_link = node_link[-1]
+                        backlinks_dict[page]["nodes"][link_node]["backlinks"].append(node_link)
                     else:
-                        if link[-1].startswith('#'):
-                            link_page = link[-2]
-                            link_node = link_node = link[-1][1:]
+                        if node_link[-1].startswith('#'):
+                            link_page = node_link[-2]
+                            link_node = node_link[-1][1:]
                             backlinks_dict[link_page]["nodes"][link_node]["backlinks"].append(pages_dict[page]["nodes"][node]["id-link"])
                         else:
-                            link_page = link[-2]
+                            link_page = node_link[-2]
                             backlinks_dict[link_page]["backlinks"].append(pages_dict[page]["nodes"][node]["id-link"])
-    except KeyError as e:
-        print('except:', e)
 
-del(pages_dict)
+    return backlinks_dict
 
-for page in backlinks_dict:
-    write_flag = 0
-    try:
-        with open('public/posts/' + page + '/index.html', 'r+') as f:
-            soup = BeautifulSoup(f.read(), "lxml")
-    except FileNotFoundError as e:
-        print('except:', e)
-        continue
 
-    for i in backlinks_dict[page]["backlinks"]:
+def write_backlinks(backlinks_dict):
+    def parse_page_backlinks(i):
         backlink_filename = i.split('/')[-2]
         backlink_nodename = i.split('/')[-1][1:]
-        backlink_name = backlinks_dict[backlink_filename]["nodes"][backlink_nodename]["name"].rstrip()
-        i = "/posts" + i
+        try:
+            backlink_name = backlinks_dict[backlink_filename]["nodes"][backlink_nodename]["name"].rstrip()
+        except KeyError:
+            _filter = [
+                x
+                for x in backlinks_dict
+                if backlinks_dict[x]["path"] == f"public{i}index.html"
+            ][0]
+            backlink_name = backlinks_dict[_filter]["name"]
 
         new_li = soup.new_tag("li")
         new_a = soup.new_tag("a", href=i)
         new_a.string = backlink_name
         new_li.append(new_a)
-        bl_section = soup.find(class_="bl-section")
-        if bl_section != None:
-            bl_section.find_next("ul").append(new_li)
-            write_flag = 1
+        bl_section = soup.find(class_="bl-section").find(name="ul")
+        bl_section.append(new_li)
 
-    for i in backlinks_dict[page]["nodes"]:
-        if backlinks_dict[page]["nodes"][i]["backlinks"] == []:
-            continue
+
+    def parse_node_backlink(i):
         new_bl = new_section(soup)
         for ii in backlinks_dict[page]["nodes"][i]["backlinks"]:
-            link = ii.split('/')
-            if len(link) == 1:
-                if link[0].startswith('#'):
-                    backlink_nodename = link[0][1:]
-                    backlink_name = backlinks_dict[page]["nodes"][backlink_nodename]["name"].rstrip()
-                else:
-                    backlink_nodename = link[0]
-                    backlink_name = backlinks_dict[backlink_nodename]["name"].rstrip() #FIXME
-                    ii = '/posts/' + backlink_nodename + '/'
-            if len(link) > 1:
-                backlink_pagename = link[1]
-                backlink_nodename = link[2][1:]
-                backlink_name = backlinks_dict[backlink_pagename]["nodes"][backlink_nodename]["name"].rstrip()
+            link = ii.split("/")
+
+            if link[-1].startswith("#"):
+                backlink_nodename = link[0][1:]
+                backlink_name = backlinks_dict[page]["nodes"][backlink_nodename][
+                    "name"
+                ].rstrip()
+
+                # print(backlink_name)
+            else:
+                backlink_nodename = link[2]
+                backlink_name = backlinks_dict[backlink_nodename]["name"].rstrip()
+            # if len(link) > 1:
+            #     backlink_pagename = link[2]
+            #     backlink_nodename = link[3][1:]
+            #     print(link)
+            #     print(page)
+            #     backlink_name = backlinks_dict[backlink_pagename]["nodes"][backlink_nodename][
+            #         "name"
+            #     ].rstrip()
 
             new_li = soup.new_tag("li")
             new_a = soup.new_tag("a", href=ii)
             new_a.string = backlink_name
             new_li.append(new_a)
             new_bl.append(new_li)
-
-        bl_section = soup.find(id=i)
-        iii = bl_section.find_next_sibling()
-        heading = int(bl_section.name[1:])
-        while (True):
-            if iii.name == 'div' and '🔗反向链接' in iii.text:
-                iii.find_previous_sibling().append(new_bl)
-                break
-            if iii.find_next_sibling() == None:
-                iii.append(new_bl)
-                break
-            if iii.name.startswith('h') and (int(iii.name[1:]) <= heading):
-                iii.find_previous_sibling().append(new_bl)
-                break
-            iii = iii.find_next_sibling()
-
-        write_flag = 1
+        bl_section = soup.find(id=i)
+        iii = bl_section.find_next_sibling()
+        heading = int(bl_section.name[1:])
 
-    if write_flag == 1:
-        with open('public/posts/' + page + '/index.html', 'r+') as f:
+        while (True):
+            if iii.find_next_sibling() == None:
+                iii.append(new_bl)
+                break
+            if iii.name.startswith('h') and (int(iii.name[1:]) <= heading):
+                iii.find_previous_sibling().append(new_bl)
+                break
+            iii = iii.find_next_sibling()
+
+    for page in backlinks_dict:
+        try:
+            with open(backlinks_dict[page]["path"], 'r') as f:
+                soup = BeautifulSoup(f.read(), "lxml")
+        except FileNotFoundError as e:
+            print('except:', e)
+            continue
+
+        content_section = soup.find(class_="content")
+        content_section.insert_after(new_section(soup))
+
+        for i in backlinks_dict[page]["backlinks"]:
+            parse_page_backlinks(i)
+
+        for i in backlinks_dict[page]["nodes"]:
+            if backlinks_dict[page]["nodes"][i]["backlinks"] == []:
+                continue
+
+            parse_node_backlink(i)
+
+        with open(backlinks_dict[page]["path"], 'r+') as f:
             f.write(str(soup))
             print("write " + page + '!')
-
+pages_dict = soup_link()
+backlinks_dict = build_backlinks_dict(pages_dict)
+write_backlinks(backlinks_dict)
diff --git a/themes/cortex/layouts/_default/single.html b/themes/cortex/layouts/_default/single.html
index eb3a49b..169493f 100644
--- a/themes/cortex/layouts/_default/single.html
+++ b/themes/cortex/layouts/_default/single.html
@@ -5,7 +5,7 @@

 {{ .Title }}
 
 {{ .Content }}
-{{ partial "backlinks.html" . }}
+
 {{ partial "comment.html" . }}