garden/soup.py

#!/usr/bin/env python3
import glob
import re
import urllib.parse
from collections import defaultdict
from bs4 import BeautifulSoup


def re_link(link):
    """Return every ``href`` target found in an HTML fragment.

    Args:
        link: HTML markup as a string.

    Returns:
        list[str]: the captured href values in order of appearance. The
        value may be single-quoted, double-quoted or unquoted; it ends at
        the first quote, space or ``>``.
    """
    href_pattern = re.compile(r'href=[\'"]?([^\'" >]+)')
    return href_pattern.findall(link)


def new_section(soup):
    """Build an empty backlink section for the given document.

    Args:
        soup: object providing ``new_tag`` (the parsed document the
            section will later be inserted into).

    Returns:
        A ``<div class="bl-section">`` tag containing an ``<h4>`` title
        followed by an empty ``<ul>``; the tag is not attached anywhere.
    """
    # The heading text is the Chinese word for "Backlinks".
    title = soup.new_tag("h4")
    title.string = "反向链接"
    backlink_list = soup.new_tag("ul")

    section = soup.new_tag("div")
    section["class"] = 'bl-section'
    for child in (title, backlink_list):
        section.append(child)

    return section


def soup_link():
    """Parse the generated HTML pages and collect their internal links.

    Every ``*.html`` file one directory below ``public/main``,
    ``public/daily``, ``public/references`` and ``public/articles`` is
    parsed. Elements with class ``NODE`` mark node headings: the heading is
    the grandparent of the marker, and a node's content is everything
    between its heading and the next sibling with the same tag name.

    Returns:
        defaultdict: keyed by the name of the directory containing the
        file. Each value is a dict with:

        * ``"name"``: text of the page ``<title>``.
        * ``"path"``: path of the HTML file as returned by ``glob``.
        * ``"links"``: set of hrefs of ``a.internal-link`` anchors that do
          not occur inside any node's content.
        * ``"backlinks"``: empty list, filled in later.
        * ``"nodes"``: dict keyed by heading ``id``; each entry has
          ``"name"``, ``"links"``, ``"id-link"`` and ``"backlinks"``.
    """
    patterns = (
        "public/main/*/*.html",
        "public/daily/*/*.html",
        "public/references/*/*.html",
        "public/articles/*/*.html",
    )
    files = []
    for pattern in patterns:
        files += glob.glob(pattern)

    _pages_dict = defaultdict(dict)
    for file in files:
        # The pages are written back as UTF-8, so read them the same way
        # instead of relying on the locale's default encoding.
        with open(file, 'r', encoding="utf-8") as f:
            soup = BeautifulSoup(f.read(), "lxml")

        # Anchors without an href yield None; drop them so later URL
        # handling only ever sees strings.
        page_link = set()
        for anchor in soup.find_all(name='a', class_='internal-link'):
            href = anchor.get("href")
            if href is not None:
                page_link.add(href)

        nodes_dict = {}
        # NOTE(review): pages are keyed by directory name only, so two pages
        # with the same directory name in different sections would overwrite
        # each other -- confirm this cannot happen in the site layout.
        file_name = file.split('/')[-2]

        for node in soup.find_all(class_="NODE"):
            node_element = node.parent.parent

            # Gather the markup of all siblings up to the next heading with
            # the same tag name. The None check comes first so a heading
            # that is the last element simply has empty content.
            chunks = []
            sibling = node_element.find_next_sibling()
            while sibling is not None and sibling.name != node_element.name:
                chunks.append(str(sibling))
                sibling = sibling.find_next_sibling()
            node_content = ''.join(chunks)

            node_id = node_element["id"]
            nodes_dict[node_id] = {
                "name": node_element.contents[0],
                "links": re_link(node_content),
                "id-link": '/' + file_name + '/#' + node_id,
                "backlinks": [],
            }

        # Links that belong to a node are attributed to that node, not to
        # the page as a whole.
        for node_dict in nodes_dict.values():
            page_link.difference_update(node_dict["links"])

        _pages_dict[file_name]["name"] = soup.title.string
        _pages_dict[file_name]["path"] = file
        _pages_dict[file_name]["links"] = page_link
        _pages_dict[file_name]["backlinks"] = []
        _pages_dict[file_name]["nodes"] = nodes_dict

    return _pages_dict


def build_backlinks_dict(pages_dict):
    """Fill in the ``"backlinks"`` lists of every page and node.

    Args:
        pages_dict: mapping of page key to a dict with ``"path"``,
            ``"links"``, ``"backlinks"`` and ``"nodes"``; each node has
            ``"links"``, ``"id-link"`` and ``"backlinks"``.

    Returns:
        The same mapping object that was passed in; it is updated in place.

    Backlink values recorded:

    * page-level link -> the linking page's URL path (``"/section/page/"``),
      appended to the target page or, when the href ends in ``#id``, to the
      target node;
    * node link to a node on the same page -> ``"#<source node id>"``;
    * node link to another page or to a node on it -> the source node's
      ``"id-link"``.

    Links whose target page or node is unknown are skipped, as are hrefs
    too short to name a page.
    """
    backlinks_dict = pages_dict

    # Index pages by path once instead of scanning every page per link.
    # setdefault keeps the first page for a path, like the former scan did.
    path_to_page = {}
    for key, entry in backlinks_dict.items():
        path_to_page.setdefault(entry.get("path"), key)

    def find_node(page_key, node_id):
        # .get() everywhere: never insert phantom entries when pages_dict
        # is a defaultdict.
        return backlinks_dict.get(page_key, {}).get("nodes", {}).get(node_id)

    for page in pages_dict:
        for _link in pages_dict[page]["links"]:
            if not _link:
                continue

            # e.g. ['', 'main', 'some-page', '#some-node-id']
            link = urllib.parse.unquote(_link).split("/")
            if len(link) < 3:
                # Not of the form /section/page/...: no target page path.
                continue

            target_page = path_to_page.get(
                f"public/{link[-3]}/{link[-2]}/index.html"
            )
            if target_page is None:
                continue

            post_link = re.findall(r"public(.*)index.html", pages_dict[page]["path"])[0]

            if "#" in link[-1]:
                target_node = find_node(target_page, link[-1][1:])
                if target_node is not None:
                    target_node["backlinks"].append(post_link)
            else:
                backlinks_dict[target_page]["backlinks"].append(post_link)

        for node, node_entry in pages_dict[page]["nodes"].items():
            for raw_link in node_entry["links"]:
                node_link = urllib.parse.unquote(raw_link).split('/')

                if len(node_link) == 1:
                    # Bare "#id": a node on the same page.
                    target_node = find_node(page, node_link[0][1:])
                    if target_node is not None:
                        target_node["backlinks"].append('#' + node)
                elif len(node_link) > 2:
                    # e.g. ['', 'main', 'some-page', '#some-node-id']
                    link_page = node_link[-2]
                    anchor = node_link[-1]

                    if anchor.startswith('#'):
                        target_node = find_node(link_page, anchor[1:])
                        if target_node is None:
                            continue
                        if link_page == page:
                            # Same page via an absolute URL: record the
                            # source node, as for a bare "#id" link.
                            target_node["backlinks"].append('#' + node)
                        else:
                            target_node["backlinks"].append(node_entry["id-link"])
                    elif link_page != page:
                        # Plain page link; a node linking to its own page
                        # produces no backlink.
                        target = backlinks_dict.get(link_page)
                        if target is not None and "backlinks" in target:
                            target["backlinks"].append(node_entry["id-link"])

    return backlinks_dict


def write_backlinks(backlinks_dict):
    """Insert backlink sections into every page and rewrite the files.

    For each page in ``backlinks_dict`` the HTML file at ``"path"`` is
    parsed, a page-level backlink section is inserted right after the
    element with class ``content``, one section is added for every node
    that has backlinks, and the document is written back as UTF-8.

    Args:
        backlinks_dict: mapping of page key to page data with ``"name"``,
            ``"path"``, ``"backlinks"`` and ``"nodes"``.

    Pages whose file is missing or that have no ``content`` element are
    reported on stdout and skipped.
    """

    def make_item(href, text):
        # One <li><a href=...>text</a></li> entry of a backlink list.
        new_li = soup.new_tag("li")
        new_a = soup.new_tag("a", href=href)
        new_a.string = text
        new_li.append(new_a)
        return new_li

    def is_heading(tag):
        # True only for h1, h2, ...; plain startswith('h') would also match
        # tags such as <hr> or <header>.
        return tag.name.startswith('h') and tag.name[1:].isdigit()

    def parse_page_backlinks(i):
        # i is either "/section/page/" or a node link "/page/#node-id".
        parts = i.split('/')
        backlink_filename = parts[-2]
        backlink_nodename = parts[-1][1:]

        # .get() instead of indexing so a defaultdict never grows while the
        # caller is iterating over it.
        source_node = (
            backlinks_dict.get(backlink_filename, {})
            .get("nodes", {})
            .get(backlink_nodename)
        )
        if source_node is not None:
            backlink_name = source_node["name"].rstrip()
        else:
            # Not a node link: find the page by the path the URL maps to.
            _filter = [
                x
                for x in backlinks_dict
                if backlinks_dict[x]["path"] == f"public{i}index.html"
            ][0]
            backlink_name = backlinks_dict[_filter]["name"]

        page_list.append(make_item(i, backlink_name))

    def parse_node_backlink(i):
        new_bl = new_section(soup)
        # Items go into the section's <ul>, not next to it.
        bl_list = new_bl.find(name="ul")

        for ii in backlinks_dict[page]["nodes"][i]["backlinks"]:
            link = ii.split("/")

            if len(link) == 1:
                # "#node-id": a node on this page.
                backlink_name = backlinks_dict[page]["nodes"][link[0][1:]]["name"].rstrip()
            elif link[-1].startswith("#"):
                # "/page/#node-id": a node on another page.
                backlink_name = backlinks_dict[link[-2]]["nodes"][link[-1][1:]]["name"].rstrip()
            else:
                # "/section/page/": a whole page.
                backlink_name = backlinks_dict[link[-2]]["name"].rstrip()

            bl_list.append(make_item(ii, backlink_name))

        # Place the section once, at the end of this node's content: walk
        # the siblings after the heading up to the next heading of the same
        # or a higher level (or the last sibling).
        heading_tag = soup.find(id=i)
        heading = int(heading_tag.name[1:])
        iii = heading_tag.find_next_sibling()

        if iii is None:
            # The heading is the last element: nothing to append into.
            heading_tag.insert_after(new_bl)
            return

        while True:
            following = iii.find_next_sibling()
            if following is None:
                iii.append(new_bl)
                break
            if is_heading(iii) and int(iii.name[1:]) <= heading:
                previous = iii.find_previous_sibling()
                if previous is heading_tag:
                    # Empty node: do not nest the section inside the
                    # heading itself.
                    heading_tag.insert_after(new_bl)
                else:
                    previous.append(new_bl)
                break
            iii = following

    for page in backlinks_dict:
        try:
            # Read with the same encoding used when writing the page back.
            with open(backlinks_dict[page]["path"], 'r', encoding="utf-8") as f:
                soup = BeautifulSoup(f.read(), "lxml")
        except FileNotFoundError as e:
            print('except:', e)
            continue

        content_section = soup.find(class_="content")
        if content_section is None:
            print('except: no "content" element in', backlinks_dict[page]["path"])
            continue

        page_section = new_section(soup)
        content_section.insert_after(page_section)
        # Keep a direct reference to the page-level list so page backlinks
        # never land in some other bl-section of the document.
        page_list = page_section.find(name="ul")

        for i in backlinks_dict[page]["backlinks"]:
            parse_page_backlinks(i)
        for i in backlinks_dict[page]["nodes"]:
            if not backlinks_dict[page]["nodes"][i]["backlinks"]:
                continue

            parse_node_backlink(i)

        with open(backlinks_dict[page]["path"], "w", encoding="utf-8") as f:
            f.write(str(soup))
            print("write " + page + '!')


# Script driver: scan the generated pages, compute the backlinks, then rewrite
# the HTML files with the backlink sections added.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so these
# statements also run whenever this module is imported.
pages_dict = soup_link()
backlinks_dict = build_backlinks_dict(pages_dict)
write_backlinks(backlinks_dict)
new init 2023-05-20 13:32:59 +02:00			`#!/usr/bin/env python3`
			`import glob`
			`import re`
			`import urllib.parse`
			`from collections import defaultdict`
			`from bs4 import BeautifulSoup`



			`def re_link(link):`
			`urls = re.findall(r'href=[\'"]?([^\'" >]+)', link)`
			`return urls`

[feat] build backlink section by python 2024-06-04 16:29:50 +02:00
new init 2023-05-20 13:32:59 +02:00			`def new_section(soup):`
			`new_div = soup.new_tag("div")`
			`new_div["class"] = 'bl-section'`
			`new_h4 = soup.new_tag("h4")`
			`new_h4.string = "反向链接"`
			`new_div.append(new_h4)`
			`bl_ul = soup.new_tag("ul")`
			`new_div.append(bl_ul)`

			`return new_div`


			`def soup_link():`
[feat] build backlink section by python 2024-06-04 16:29:50 +02:00			`files = glob.glob("public/main/*/*.html")`
			`files += glob.glob("public/daily/*/*.html")`
			`files += glob.glob("public/references/*/*.html")`
			`files += glob.glob("public/articles/*/*.html")`
new init 2023-05-20 13:32:59 +02:00
[feat] build backlink section by python 2024-06-04 16:29:50 +02:00			`_pages_dict = defaultdict(dict)`
new init 2023-05-20 13:32:59 +02:00			`for file in files:`
			`with open(file, 'r') as f:`
			`soup = BeautifulSoup(f.read(), "lxml")`

			`temp_link = soup.find_all(name='a',class_='internal-link')`
[feat] build backlink section by python 2024-06-04 16:29:50 +02:00			`page_link = set()`
new init 2023-05-20 13:32:59 +02:00			`for i in temp_link:`
			`i = i.get("href")`
[feat] build backlink section by python 2024-06-04 16:29:50 +02:00			`page_link.add(i)`
new init 2023-05-20 13:32:59 +02:00
			`nodes = soup.find_all(class_ = "NODE")`
			`nodes_dict = {}`
			`file_name = file.split('/')[-2]`

			`for node in nodes:`
			`node_dict = {}`
			`node_element = node.parent.parent`
			`node_dict["name"] = node_element.contents[0]`

			`iii = node_element.find_next_sibling()`
			`node_content = ''`
			`while node_element.name != iii.name:`
			`node_content += str(iii)`
			`iii = iii.find_next_sibling()`
[feat] build backlink section by python 2024-06-04 16:29:50 +02:00			`if iii == None:`
new init 2023-05-20 13:32:59 +02:00			`break`

			`node_id_link = '/' + file_name + '/#' +node_element["id"]`
			`node_dict["links"] = re_link(node_content)`
			`node_dict["id-link"] = node_id_link`
			`node_dict["backlinks"] = []`
			`nodes_dict[node_element["id"]] = node_dict`

			`for i in nodes_dict:`
			`for node_link in nodes_dict[i]["links"]:`
			`if node_link in page_link:`
			`page_link.remove(node_link)`

[feat] build backlink section by python 2024-06-04 16:29:50 +02:00			`_pages_dict[file_name]["name"] = soup.title.string`
			`_pages_dict[file_name]["path"] = file`
			`_pages_dict[file_name]["links"] = page_link`
			`_pages_dict[file_name]["backlinks"] = []`
			`_pages_dict[file_name]["nodes"] = nodes_dict`
new init 2023-05-20 13:32:59 +02:00
[feat] build backlink section by python 2024-06-04 16:29:50 +02:00			`return _pages_dict`


			`def build_backlinks_dict(pages_dict):`
			`backlinks_dict = pages_dict`
			`for page in pages_dict:`
			`for _link in pages_dict[page]["links"]:`
			`if _link == "":`
			`continue`

			`link = urllib.parse.unquote(_link).split("/")`
			`_filter = [`
			`i`
			`for i in backlinks_dict`
			`if backlinks_dict[i]["path"] == f"public/{link[-3]}/{link[-2]}/index.html"`
			`]`
			`if len(_filter) == 0:`
			`continue`

			`post_link = re.findall(r"public(.*)index.html", pages_dict[page]["path"])[0]`

			`if "#" in link[-1]:`
new init 2023-05-20 13:32:59 +02:00			`if len(link) > 2:`
			`# ['', 'posts', '卡片笔记写作法', '#要如何在心流状态下写作-p39']`
			`link_page = link[-2]`
			`link_node = link[-1][1:]`

[feat] build backlink section by python 2024-06-04 16:29:50 +02:00			`backlinks_dict[link_page]["nodes"][link_node]["backlinks"].append(post_link)`
			`else:`
			`backlinks_dict[_filter[0]]["backlinks"].append(post_link)`
new init 2023-05-20 13:32:59 +02:00
			`for node in pages_dict[page]["nodes"]:`
[feat] build backlink section by python 2024-06-04 16:29:50 +02:00			`for node_link in pages_dict[page]["nodes"][node]["links"]:`
			`node_link = urllib.parse.unquote(node_link)`
			`node_link = node_link.split('/')`

			`if len(node_link) == 1:`
			`link_node = node_link[0][1:]`
			`node_link = '#' + node`
			`backlinks_dict[page]["nodes"][link_node]["backlinks"].append(node_link)`
			`elif len(node_link) > 2:`
new init 2023-05-20 13:32:59 +02:00			`# ['', 'posts', '卡片笔记写作法', '#卡片笔记中有哪些索引-p9']`
[feat] build backlink section by python 2024-06-04 16:29:50 +02:00			`if node_link[-2] == page:`
new init 2023-05-20 13:32:59 +02:00			`link_node = link[-1][1:]`
[feat] build backlink section by python 2024-06-04 16:29:50 +02:00			`node_link = node_link[-1]`
			`backlinks_dict[page]["nodes"][link_node]["backlinks"].append(node_link)`
new init 2023-05-20 13:32:59 +02:00			`else:`
[feat] build backlink section by python 2024-06-04 16:29:50 +02:00			`if node_link[-1].startswith('#'):`
			`link_page = node_link[-2]`
			`link_node = node_link[-1][1:]`
new init 2023-05-20 13:32:59 +02:00			`backlinks_dict[link_page]["nodes"][link_node]["backlinks"].append(pages_dict[page]["nodes"][node]["id-link"])`
			`else:`
[feat] build backlink section by python 2024-06-04 16:29:50 +02:00			`link_page = node_link[-2]`
new init 2023-05-20 13:32:59 +02:00			`backlinks_dict[link_page]["backlinks"].append(pages_dict[page]["nodes"][node]["id-link"])`

[feat] build backlink section by python 2024-06-04 16:29:50 +02:00			`return backlinks_dict`
new init 2023-05-20 13:32:59 +02:00

[feat] build backlink section by python 2024-06-04 16:29:50 +02:00			`def write_backlinks(backlinks_dict):`
			`def parse_page_backlinks(i):`
new init 2023-05-20 13:32:59 +02:00			`backlink_filename = i.split('/')[-2]`
			`backlink_nodename = i.split('/')[-1][1:]`
[feat] build backlink section by python 2024-06-04 16:29:50 +02:00			`try:`
			`backlink_name = backlinks_dict[backlink_filename]["nodes"][backlink_nodename]["name"].rstrip()`
			`except KeyError:`
			`_filter = [`
			`x`
			`for x in backlinks_dict`
			`if backlinks_dict[x]["path"] == f"public{i}index.html"`
			`][0]`
			`backlink_name = backlinks_dict[_filter]["name"]`
new init 2023-05-20 13:32:59 +02:00
			`new_li = soup.new_tag("li")`
			`new_a = soup.new_tag("a", href=i)`
			`new_a.string = backlink_name`
			`new_li.append(new_a)`

[feat] build backlink section by python 2024-06-04 16:29:50 +02:00			`bl_section = soup.find(class_="bl-section").find(name="ul")`
			`bl_section.append(new_li)`
new init 2023-05-20 13:32:59 +02:00
[feat] build backlink section by python 2024-06-04 16:29:50 +02:00
			`def parse_node_backlink(i):`
new init 2023-05-20 13:32:59 +02:00			`new_bl = new_section(soup)`
			`for ii in backlinks_dict[page]["nodes"][i]["backlinks"]:`
[feat] build backlink section by python 2024-06-04 16:29:50 +02:00			`link = ii.split("/")`

			`if link[-1].startswith("#"):`
			`backlink_nodename = link[0][1:]`
			`backlink_name = backlinks_dict[page]["nodes"][backlink_nodename][`
			`"name"`
			`].rstrip()`

			`# print(backlink_name)`
			`else:`
			`backlink_nodename = link[2]`
			`backlink_name = backlinks_dict[backlink_nodename]["name"].rstrip()`
			`# if len(link) > 1:`
			`# backlink_pagename = link[2]`
			`# backlink_nodename = link[3][1:]`
			`# print(link)`
			`# print(page)`
			`# backlink_name = backlinks_dict[backlink_pagename]["nodes"][backlink_nodename][`
			`# "name"`
			`# ].rstrip()`
new init 2023-05-20 13:32:59 +02:00
			`new_li = soup.new_tag("li")`
			`new_a = soup.new_tag("a", href=ii)`
			`new_a.string = backlink_name`
			`new_li.append(new_a)`
			`new_bl.append(new_li)`
[feat] build backlink section by python 2024-06-04 16:29:50 +02:00
			`bl_section = soup.find(id=i)`
			`iii = bl_section.find_next_sibling()`
			`heading = int(bl_section.name[1:])`

			`while (True):`
			`if iii.find_next_sibling() == None:`
			`iii.append(new_bl)`
			`break`
			`if iii.name.startswith('h') and (int(iii.name[1:]) <= heading):`
			`iii.find_previous_sibling().append(new_bl)`
			`break`
			`iii = iii.find_next_sibling()`

			`for page in backlinks_dict:`
			`try:`
			`with open(backlinks_dict[page]["path"], 'r') as f:`
			`soup = BeautifulSoup(f.read(), "lxml")`
			`except FileNotFoundError as e:`
			`print('except:', e)`
			`continue`

			`content_section = soup.find(class_="content")`
			`content_section.insert_after(new_section(soup))`

			`for i in backlinks_dict[page]["backlinks"]:`
			`parse_page_backlinks(i)`
			`for i in backlinks_dict[page]["nodes"]:`
			`if backlinks_dict[page]["nodes"][i]["backlinks"] == []:`
			`continue`

			`parse_node_backlink(i)`

[fix] ensure bs4 write back normal 2024-06-19 05:14:43 +02:00			`with open(backlinks_dict[page]["path"], "w", encoding="utf-8") as f:`
new init 2023-05-20 13:32:59 +02:00			`f.write(str(soup))`
			`print("write " + page + '!')`


[feat] build backlink section by python 2024-06-04 16:29:50 +02:00			`pages_dict = soup_link()`
			`backlinks_dict = build_backlinks_dict(pages_dict)`
			`write_backlinks(backlinks_dict)`