#!/usr/bin/env python3
"""Post-process Hugo-style HTML output under public/posts/ to inject backlinks.

Pass 1 (soup_link): scan every rendered post, collect its outgoing internal
links, and for each marked note node (class "NODE") record its id, anchor
link, and outgoing links.

Pass 2 (top level): invert that link graph into per-page and per-node
"backlinks" lists, then rewrite each index.html, appending a "反向链接"
(backlinks) section after the page-level marker and after each node heading.
"""
import glob
import re
import urllib.parse
from collections import defaultdict

from bs4 import BeautifulSoup


def re_link(link):
    """Return every href target found in *link* (a fragment of raw HTML).

    Matches href values whether quoted with ', ", or unquoted.
    """
    urls = re.findall(r'href=[\'"]?([^\'" >]+)', link)
    return urls


def new_section(soup):
    """Build and return a detached <div class="bl-section"> backlinks section.

    Structure: a heading ("反向链接"), an empty <div class="backlinks">, and an
    empty <ul>. Callers append <li> entries to the returned div.
    *soup* is only used as the tag factory.
    """
    new_div = soup.new_tag("div")
    new_div["class"] = 'bl-section'
    new_h4 = soup.new_tag("h4")
    new_h4.string = "反向链接"
    new_div.append(new_h4)
    bl_div = soup.new_tag("div")
    bl_div["class"] = 'backlinks'
    new_div.append(bl_div)
    bl_ul = soup.new_tag("ul")
    new_div.append(bl_ul)
    return new_div


def soup_link():
    """Scan public/posts/*/*.html and return the forward-link graph.

    Returns a defaultdict keyed by post directory name, each value a dict:
        name      -- the page <title>
        links     -- hrefs of <a class="internal-link"> not claimed by a node
        backlinks -- empty list, filled in by the top-level inversion pass
        nodes     -- per-node dicts (name, links, id-link, backlinks) keyed
                     by the node heading's id attribute
    """
    files = glob.glob("public/posts/*/*.html")
    pages_dict = defaultdict(dict)
    for file in files:
        # encoding pinned: posts contain UTF-8 Chinese text, so the platform
        # default locale encoding must not be relied on.
        with open(file, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f.read(), "lxml")
        temp_link = soup.find_all(name='a', class_='internal-link')
        page_link = []
        for i in temp_link:
            i = i.get("href")
            page_link.append(i)
        nodes = soup.find_all(class_="NODE")
        nodes_dict = {}
        file_name = file.split('/')[-2]  # post directory name
        for node in nodes:
            node_dict = {}
            # The NODE marker sits two levels below its heading element.
            node_element = node.parent.parent
            node_dict["name"] = node_element.contents[0]
            # Accumulate the HTML between this heading and the next heading of
            # the same level (or an existing backlinks div / end of document).
            iii = node_element.find_next_sibling()
            node_content = ''
            # Guard iii is not None: the original crashed with AttributeError
            # when the heading was the last element of its parent.
            while iii is not None and node_element.name != iii.name:
                node_content += str(iii)
                iii = iii.find_next_sibling()
                if iii is None or (iii.name == 'div' and '🔗反向链接' in iii.text):
                    break
            node_id_link = '/' + file_name + '/#' + node_element["id"]
            node_dict["links"] = re_link(node_content)
            node_dict["id-link"] = node_id_link
            node_dict["backlinks"] = []
            nodes_dict[node_element["id"]] = node_dict
        # Links already attributed to a node are removed from the page-level
        # link list so they are not counted twice.
        for i in nodes_dict:
            for node_link in nodes_dict[i]["links"]:
                if node_link in page_link:
                    page_link.remove(node_link)
        pages_dict[file_name]["name"] = soup.title.string
        pages_dict[file_name]["links"] = page_link
        pages_dict[file_name]["backlinks"] = []
        pages_dict[file_name]["nodes"] = nodes_dict
    return pages_dict


pages_dict = soup_link()
# Deliberate aliasing: backlinks_dict shares the nested page/node dicts with
# pages_dict, so appending to a "backlinks" list is visible through both
# names. pages_dict is rebound to a plain-dict shallow copy purely so that
# iterating it cannot create new defaultdict entries.
backlinks_dict = pages_dict
pages_dict = dict(pages_dict)

# ---- Pass 2a: invert the forward-link graph into backlinks lists. ----
for page in pages_dict:
    try:
        # Page-level outgoing links that target a specific node anchor.
        for link in pages_dict[page]["links"]:
            link = urllib.parse.unquote(link)
            if '#' in link:
                link = link.split('/')
                if len(link) > 2:
                    # e.g. ['', 'posts', '卡片笔记写作法', '#要如何在心流状态下写作-p39']
                    link_page = link[-2]
                    link_node = link[-1][1:]
                    backlinks_dict[link_page]["nodes"][link_node]["backlinks"].append(page)
        # Node-level outgoing links.
        for node in pages_dict[page]["nodes"]:
            for link in pages_dict[page]["nodes"][node]["links"]:
                link = urllib.parse.unquote(link)
                link = link.split('/')
                if len(link) == 1:
                    # Bare fragment "#target": same-page node-to-node link.
                    link_node = link[0][1:]
                    link = '#' + node
                    backlinks_dict[page]["nodes"][link_node]["backlinks"].append(link)
                elif len(link) > 2:
                    # e.g. ['', 'posts', '卡片笔记写作法', '#卡片笔记中有哪些索引-p9']
                    if link[-2] == page:
                        # Self-page anchor written via full path.
                        link_node = link[-1][1:]
                        link = link[-1]
                        backlinks_dict[page]["nodes"][link_node]["backlinks"].append(link)
                    else:
                        if link[-1].startswith('#'):
                            # Cross-page link to a specific node.
                            link_page = link[-2]
                            link_node = link[-1][1:]
                            backlinks_dict[link_page]["nodes"][link_node]["backlinks"].append(pages_dict[page]["nodes"][node]["id-link"])
                        else:
                            # Cross-page link to the page itself.
                            link_page = link[-2]
                            backlinks_dict[link_page]["backlinks"].append(pages_dict[page]["nodes"][node]["id-link"])
    except KeyError as e:
        # Links to pages/nodes outside the scanned set are skipped, best-effort.
        print('except:', e)
del pages_dict

# ---- Pass 2b: rewrite each index.html with its backlinks sections. ----
for page in backlinks_dict:
    write_flag = 0
    try:
        with open('public/posts/' + page + '/index.html', 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f.read(), "lxml")
    except FileNotFoundError as e:
        print('except:', e)
        continue
    # Page-level backlinks go into the pre-existing bl-section's <ul>.
    for i in backlinks_dict[page]["backlinks"]:
        backlink_filename = i.split('/')[-2]
        backlink_nodename = i.split('/')[-1][1:]
        backlink_name = backlinks_dict[backlink_filename]["nodes"][backlink_nodename]["name"].rstrip()
        i = "/posts" + i
        new_li = soup.new_tag("li")
        new_a = soup.new_tag("a", href=i)
        new_a.string = backlink_name
        new_li.append(new_a)
        bl_section = soup.find(class_="bl-section")
        if bl_section is not None:
            bl_section.find_next("ul").append(new_li)
            # NOTE(review): write_flag is set only when the entry was actually
            # inserted; the flattened original is ambiguous here — confirm.
            write_flag = 1
    # Node-level backlinks get a freshly built section per node.
    for i in backlinks_dict[page]["nodes"]:
        if backlinks_dict[page]["nodes"][i]["backlinks"] == []:
            continue
        new_bl = new_section(soup)
        for ii in backlinks_dict[page]["nodes"][i]["backlinks"]:
            link = ii.split('/')
            if len(link) == 1:
                if link[0].startswith('#'):
                    # Same-page anchor; href stays relative.
                    backlink_nodename = link[0][1:]
                    backlink_name = backlinks_dict[page]["nodes"][backlink_nodename]["name"].rstrip()
                else:
                    # Page-level entry; rebuild an absolute post URL.
                    backlink_nodename = link[0]
                    backlink_name = backlinks_dict[backlink_nodename]["name"].rstrip()  # FIXME
                    ii = '/posts/' + backlink_nodename + '/'
            if len(link) > 1:
                # id-link form '/<page>/#<node-id>'.
                # NOTE(review): href is emitted without the '/posts' prefix
                # here, unlike the page-level branch above — confirm intended.
                backlink_pagename = link[1]
                backlink_nodename = link[2][1:]
                backlink_name = backlinks_dict[backlink_pagename]["nodes"][backlink_nodename]["name"].rstrip()
            new_li = soup.new_tag("li")
            new_a = soup.new_tag("a", href=ii)
            new_a.string = backlink_name
            new_li.append(new_a)
            new_bl.append(new_li)
        # Insert the section just before the next heading of equal-or-higher
        # level, before an existing backlinks div, or at the end of the body.
        bl_section = soup.find(id=i)
        iii = bl_section.find_next_sibling()
        heading = int(bl_section.name[1:])
        while True:
            if iii.name == 'div' and '🔗反向链接' in iii.text:
                iii.find_previous_sibling().append(new_bl)
                break
            if iii.find_next_sibling() is None:
                iii.append(new_bl)
                break
            if iii.name.startswith('h') and int(iii.name[1:]) <= heading:
                iii.find_previous_sibling().append(new_bl)
                break
            iii = iii.find_next_sibling()
        write_flag = 1
    if write_flag == 1:
        # Mode 'w' (was 'r+'): 'r+' writes from offset 0 without truncating,
        # leaving stale bytes at the tail whenever the new HTML is shorter.
        with open('public/posts/' + page + '/index.html', 'w', encoding='utf-8') as f:
            f.write(str(soup))
        print("write " + page + '!')