#!/usr/bin/env python3
"""Collect internal links from exported HTML pages and append backlinks sections."""
import glob
import re
import urllib.parse
from collections import defaultdict

from bs4 import BeautifulSoup


def re_link(link):
    """Extract all href targets from an HTML fragment."""
    urls = re.findall(r'href=[\'"]?([^\'" >]+)', link)
    return urls


def new_section(soup):
    """Build an empty backlinks <div> containing a heading and a <ul>."""
    new_div = soup.new_tag("div")
    new_div["class"] = "bl-section"
    new_h4 = soup.new_tag("h4")
    new_h4.string = "反向链接"  # "Backlinks"
    new_div.append(new_h4)
    bl_ul = soup.new_tag("ul")
    new_div.append(bl_ul)
    return new_div


def soup_link():
    """Scan the exported HTML files and collect page-level and node-level links."""
    files = glob.glob("public/main/*/*.html")
    files += glob.glob("public/daily/*/*.html")
    files += glob.glob("public/references/*/*.html")
    files += glob.glob("public/articles/*/*.html")
    _pages_dict = defaultdict(dict)
    for file in files:
        with open(file, "r") as f:
            soup = BeautifulSoup(f.read(), "lxml")
        temp_link = soup.find_all(name="a", class_="internal-link")
        page_link = set()
        for i in temp_link:
            page_link.add(i.get("href"))
        nodes = soup.find_all(class_="NODE")
        nodes_dict = {}
        file_name = file.split("/")[-2]
        for node in nodes:
            node_dict = {}
            node_element = node.parent.parent
            node_dict["name"] = node_element.contents[0]
            # Collect the node's content: every following sibling up to the
            # next element with the same tag name (the next heading of the same level).
            iii = node_element.find_next_sibling()
            node_content = ""
            while iii is not None and node_element.name != iii.name:
                node_content += str(iii)
                iii = iii.find_next_sibling()
            node_id_link = "/" + file_name + "/#" + node_element["id"]
            node_dict["links"] = re_link(node_content)
            node_dict["id-link"] = node_id_link
            node_dict["backlinks"] = []
            nodes_dict[node_element["id"]] = node_dict
        # Links already attributed to a node are removed from the page-level set.
        for i in nodes_dict:
            for node_link in nodes_dict[i]["links"]:
                if node_link in page_link:
                    page_link.remove(node_link)
        _pages_dict[file_name]["name"] = soup.title.string
        _pages_dict[file_name]["path"] = file
        _pages_dict[file_name]["links"] = page_link
        _pages_dict[file_name]["backlinks"] = []
        _pages_dict[file_name]["nodes"] = nodes_dict
    return _pages_dict


def build_backlinks_dict(pages_dict):
    """Invert the collected links into per-page and per-node backlink lists."""
    backlinks_dict = pages_dict  # mutated in place; kept as an alias for readability
    for page in pages_dict:
        # Page-level links.
        for _link in pages_dict[page]["links"]:
            if _link == "":
                continue
            link = urllib.parse.unquote(_link).split("/")
            _filter = [
                i
                for i in backlinks_dict
                if backlinks_dict[i]["path"] == f"public/{link[-3]}/{link[-2]}/index.html"
            ]
            if len(_filter) == 0:
                continue
            post_link = re.findall(r"public(.*)index\.html", pages_dict[page]["path"])[0]
            if "#" in link[-1]:
                if len(link) > 2:
                    # e.g. ['', 'posts', '卡片笔记写作法', '#要如何在心流状态下写作-p39']
                    link_page = link[-2]
                    link_node = link[-1][1:]
                    backlinks_dict[link_page]["nodes"][link_node]["backlinks"].append(post_link)
            else:
                backlinks_dict[_filter[0]]["backlinks"].append(post_link)
        # Node-level links.
        for node in pages_dict[page]["nodes"]:
            for node_link in pages_dict[page]["nodes"][node]["links"]:
                node_link = urllib.parse.unquote(node_link)
                node_link = node_link.split("/")
                if len(node_link) == 1:
                    # Same-page anchor, e.g. '#node-id'.
                    link_node = node_link[0][1:]
                    node_link = "#" + node
                    backlinks_dict[page]["nodes"][link_node]["backlinks"].append(node_link)
                elif len(node_link) > 2:
                    # e.g. ['', 'posts', '卡片笔记写作法', '#卡片笔记中有哪些索引-p9']
                    if node_link[-2] == page:
                        link_node = node_link[-1][1:]
                        node_link = node_link[-1]
                        backlinks_dict[page]["nodes"][link_node]["backlinks"].append(node_link)
                    else:
                        if node_link[-1].startswith("#"):
                            link_page = node_link[-2]
                            link_node = node_link[-1][1:]
                            backlinks_dict[link_page]["nodes"][link_node]["backlinks"].append(
                                pages_dict[page]["nodes"][node]["id-link"]
                            )
                        else:
                            link_page = node_link[-2]
                            backlinks_dict[link_page]["backlinks"].append(
                                pages_dict[page]["nodes"][node]["id-link"]
                            )
    return backlinks_dict


def write_backlinks(backlinks_dict):
    """Insert the backlinks sections into the HTML files and write them back."""

    def parse_page_backlinks(i):
        backlink_filename = i.split("/")[-2]
        backlink_nodename = i.split("/")[-1][1:]
        try:
            backlink_name = backlinks_dict[backlink_filename]["nodes"][backlink_nodename][
                "name"
            ].rstrip()
        except KeyError:
            # Not a node link; resolve the page by its output path instead.
            _filter = [
                x for x in backlinks_dict if backlinks_dict[x]["path"] == f"public{i}index.html"
            ][0]
            backlink_name = backlinks_dict[_filter]["name"]
        new_li = soup.new_tag("li")
        new_a = soup.new_tag("a", href=i)
        new_a.string = backlink_name
        new_li.append(new_a)
        bl_section = soup.find(class_="bl-section").find(name="ul")
        bl_section.append(new_li)

    def parse_node_backlink(i):
        new_bl = new_section(soup)
        for ii in backlinks_dict[page]["nodes"][i]["backlinks"]:
            link = ii.split("/")
            if link[-1].startswith("#"):
                if len(link) == 1:
                    # Same-page anchor, e.g. '#node-id'.
                    backlink_nodename = link[0][1:]
                    backlink_name = backlinks_dict[page]["nodes"][backlink_nodename][
                        "name"
                    ].rstrip()
                else:
                    # Cross-page node link, e.g. '/page/#node-id'.
                    backlink_pagename = link[-2]
                    backlink_nodename = link[-1][1:]
                    backlink_name = backlinks_dict[backlink_pagename]["nodes"][
                        backlink_nodename
                    ]["name"].rstrip()
            else:
                # Plain page link, e.g. '/main/page/'.
                backlink_pagename = link[2]
                backlink_name = backlinks_dict[backlink_pagename]["name"].rstrip()
            new_li = soup.new_tag("li")
            new_a = soup.new_tag("a", href=ii)
            new_a.string = backlink_name
            new_li.append(new_a)
            new_bl.append(new_li)
        # Append the node's backlinks section at the end of the node's content,
        # i.e. just before the next heading of the same or higher level.
        bl_section = soup.find(id=i)
        iii = bl_section.find_next_sibling()
        heading = int(bl_section.name[1:])
        while True:
            if iii.find_next_sibling() is None:
                iii.append(new_bl)
                break
            if iii.name.startswith("h") and int(iii.name[1:]) <= heading:
                iii.find_previous_sibling().append(new_bl)
                break
            iii = iii.find_next_sibling()

    for page in backlinks_dict:
        try:
            with open(backlinks_dict[page]["path"], "r") as f:
                soup = BeautifulSoup(f.read(), "lxml")
        except FileNotFoundError as e:
            print("except:", e)
            continue
        # The page-level backlinks section goes right after the main content block.
        content_section = soup.find(class_="content")
        content_section.insert_after(new_section(soup))
        for i in backlinks_dict[page]["backlinks"]:
            parse_page_backlinks(i)
        for i in backlinks_dict[page]["nodes"]:
            if backlinks_dict[page]["nodes"][i]["backlinks"] == []:
                continue
            parse_node_backlink(i)
        with open(backlinks_dict[page]["path"], "w", encoding="utf-8") as f:
            f.write(str(soup))
        print("write " + page + "!")


pages_dict = soup_link()
backlinks_dict = build_backlinks_dict(pages_dict)
write_backlinks(backlinks_dict)