#!/usr/bin/env python # -*- coding: utf-8 -*- # ******************************************************************************** # Copyright © 2017-2020 jianglin # File Name: document.py # Author: jianglin # Email: mail@honmaple.com # Created: 2018-02-26 11:44:43 (CST) # Last Update: Wednesday 2020-08-19 12:00:03 (CST) # Description: # ******************************************************************************** import re from hashlib import sha1 from textwrap import dedent from .inline import Blankline, Hr, InlineText from .src import highlight as src_highlight DRAWER_BEGIN_REGEXP = re.compile(r"^(\s*):(\S+):\s*$") DRAWER_END_REGEXP = re.compile(r"^(\s*):END:\s*$") DRAWER_PROPERTY_REGEXP = re.compile(r"^(\s*):(\S+):(\s+(.*)$|$)") BLOCK_BEGIN_REGEXP = re.compile(r"(?i)^(\s*)#\+BEGIN_(\w+)(.*)") BLOCK_END_REGEXP = re.compile(r"(?i)^(\s*)#\+END_(\w+)") BLOCK_RESULT_REGEXP = re.compile(r"(?i)^(\s*)#\+RESULTS:") BLOCK_RESULT_CONTENT_REGEXP = re.compile(r"(?:^|\s+):(\s+(.*)|$)") TABLE_SEP_REGEXP = re.compile(r"^(\s*)(\|[+-|]*)\s*$") TABLE_ROW_REGEXP = re.compile(r"^(\s*)(\|.*)") TABLE_ALIGN_REGEXP = re.compile(r"^<(l|c|r)>$") LIST_DESCRIPTIVE_REGEXP = re.compile(r"^(\s*)([+*-])\s+(.*)::(\s|$)") LIST_UNORDER_REGEXP = re.compile(r"^(\s*)([+*-])(\s+(.*)|$)") LIST_ORDER_REGEXP = re.compile(r"^(\s*)(([0-9]+|[a-zA-Z])[.)])(\s+(.*)|$)") LIST_STATUS_REGEXP = re.compile(r"\[( |X|-)\]\s") LIST_LEVEL_REGEXP = re.compile(r"(\s*)(.+)$") HEADLINE_REGEXP = re.compile( r"^(\*+)(?:\s+(.+?))?(?:\s+\[#(.+)\])?(\s+.*?)(?:\s+:(.+):)?$") KEYWORD_REGEXP = re.compile(r"^(\s*)#\+([^:]+):(\s+(.*)|$)") COMMENT_REGEXP = re.compile(r"^(\s*)#(.*)") ATTRIBUTE_REGEXP = re.compile(r"(?:^|\s+)(:[-\w]+)\s+(.*)$") TODO_KEYWORDS = ("DONE", "TODO") def string_split(s, sep): if not s: return [] return s.split(sep) class Parser(object): def __init__(self, content=""): self.lines = content.splitlines() self.level = 0 self.element = "" self.children = [] self.escape = True self.needparse = True self.parsed_nodes = ( "blankline", "headline", "table", "list", "drawer", "block", "block_result", "keyword", "hr", ) def first_child(self): if len(self.children) == 0: return return self.children[0] def last_child(self): if len(self.children) == 0: return return self.children[-1] def add_child(self, node): last = self.last_child() if self.is_headline(last): if self.is_properties(node): last.properties = node return if not self.is_headline(node): last.add_child(node) return if self.is_headline(node) and node.stars > last.stars: last.add_child(node) return if self.is_table(last): if self.is_table(node): last.add_child(node) return if self.is_list(last): if self.is_blankline(node): last.add_child(node) return if node.level > last.level: last.add_child(node) return if self.is_list(node) and node.level == last.level: last.add_child(node) return if self.is_keyword(last): if self.is_table(node): node.keyword = last if self.is_paragraph(last): if self.is_inlinetext(node): last.add_child(node) return if self.is_inlinetext(node): self.children.append(self.paragraph(node)) return self.children.append(node) def is_keyword(self, child): return child and isinstance(child, Keyword) def is_headline(self, child): return child and isinstance(child, Headline) def is_list(self, child): return child and isinstance(child, List) def is_table(self, child): return child and isinstance(child, Table) def is_src(self, child): return child and isinstance(child, (Src, Example)) def is_inlinetext(self, child): return child and isinstance(child, InlineText) def is_blankline(self, child): return child and isinstance(child, Blankline) def is_paragraph(self, child): return child and isinstance(child, Paragraph) def is_properties(self, child): return child and isinstance(child, Properties) def inlinetext(self, text): return InlineText(text, self.needparse, self.escape) def paragraph(self, node): n = Paragraph() n.add_child(node) return n def _parse_paired(self, cls, index, lines): node = cls.match(lines[index]) if not node: return None, index end = len(lines) num = index + 1 while num < end: if node.matchend(num, lines): node.preparse(lines[index + 1:num]) return node, num num += 1 return None, index def _parse_nopaired(self, cls, index, lines): node = cls.match(lines[index]) if not node: return None, index end = len(lines) num = index + 1 while num < end: if node.matchend(num, lines): break num += 1 node.preparse(lines[index + 1:num]) return node, num def parse_headline(self, index, lines): return Headline.match(lines[index]), index def parse_list(self, index, lines): return List.match(lines[index]), index def parse_table(self, index, lines): return self._parse_nopaired(Table, index, lines) def parse_drawer(self, index, lines): return self._parse_paired(Drawer, index, lines) def parse_block(self, index, lines): return self._parse_paired(Block, index, lines) def parse_block_result(self, index, lines): return self._parse_paired(BlockResult, index, lines) def parse_blankline(self, index, lines): return Blankline.match(lines[index]), index def parse_keyword(self, index, lines): return Keyword.match(lines[index]), index def parse_hr(self, index, lines): return Hr.match(lines[index]), index def parse_inlinetext(self, index, lines): return self.inlinetext(lines[index]), index def parse(self, index, lines): for b in self.parsed_nodes: func = "parse_" + b if not hasattr(self, func): continue block, num = getattr(self, func)(index, lines) if not block: continue return block, num return self.parse_inlinetext(index, lines) def preparse(self, lines): index = 0 while index < len(lines): line = lines[index] node, index = self.parse(index, lines) if node: node.level = len(line) - len(line.strip()) self.add_child(node) index += 1 def to_html(self): if len(self.children) == 0 and len(self.lines) > 0: self.preparse(self.lines) children = [] for child in self.children: content = child.to_html() if not content: continue children.append(content) text = "\n".join(children) if self.element: return self.element.format(text) return text def __str__(self): str_children = [str(child) for child in self.children] return self.__class__.__name__ + '(' + ','.join(str_children) + ')' def __repr__(self): return self.__str__() class Headline(Parser): def __init__( self, title, stars=1, keyword=None, priority=None, tags=[], todo_keywords=TODO_KEYWORDS): super(Headline, self).__init__() self.title = title self.stars = stars self.keyword = keyword self.priority = priority self.tags = tags self.properties = None self.todo_keywords = todo_keywords @classmethod def match(cls, line): match = HEADLINE_REGEXP.match(line) if not match: return stars = len(match[1]) keyword = match[2] or "" priority = match[3] or "" if keyword and not priority: if len(keyword) >= 4 and keyword[0:2] == "[#": priority = keyword[2:-1] keyword = "" title = keyword + match[4] keyword = "" return cls( title, stars, keyword, priority, string_split(match[5], ":"), ) def id(self): hid = 'org-{0}'.format(sha1(self.title.encode()).hexdigest()[:10]) if self.properties: return self.properties.get("CUSTOM_ID", hid) return hid def toc(self): b = "" if self.keyword: b = b + "{0}".format(self.keyword) if self.priority: b = b + "{0}".format(self.priority) b = b + self.inlinetext(self.title).to_html() for tag in self.tags: b = b + "{0}".format(tag) return b.strip() def to_html(self): b = "{2}".format( self.stars, self.id(), self.toc(), ) return b + super(Headline, self).to_html() class Drawer(Parser): def __init__(self, name): super(Drawer, self).__init__() self.name = name @classmethod def match(cls, line): match = DRAWER_BEGIN_REGEXP.match(line) if not match: return name = match[2] if name.upper() == "PROPERTIES": return Properties(name) return Drawer(name) def matchend(self, index, lines): return DRAWER_END_REGEXP.match(lines[index]) def to_html(self): return "" class Properties(Drawer): def __init__(self, name): super(Properties, self).__init__(name) self.properties = {} def parse(self, index, lines): match = DRAWER_PROPERTY_REGEXP.match(lines[index]) if match: self.properties[match[2].upper()] = match[4] return None, index def get(self, key, default=None): return self.properties.get(key, default) def to_html(self): return "" class Block(Parser): def __init__(self, name, params=""): super(Block, self).__init__() self.name = name self.params = params @classmethod def match(cls, line): match = BLOCK_BEGIN_REGEXP.match(line) if not match: return name = match[2].lower() if name == "src": return Src(*match[3].strip().split(" ", 1)) if name == "example": return Example(match[3]) if name == "center": return Center(match[3]) if name == "verse": return Verse(match[3]) if name == "quote": return Quote(match[3]) if name == "export": return Export(*match[3].strip().split(" ", 1)) return cls(name, match[3]) def matchend(self, index, lines): match = BLOCK_END_REGEXP.match(lines[index]) return match and match[2].lower() == self.name class Center(Block): def __init__(self, params=""): super(Center, self).__init__("center", params) self.element = "
\n{0}\n
" class Verse(Block): def __init__(self, params=""): super(Verse, self).__init__("verse", params) self.element = "

\n{0}\n

" def add_child(self, node): self.children.append(node) def to_html(self): children = [child.to_html() for child in self.children] return self.element.format("
".join(children)) class Quote(Block): def __init__(self, params=""): super(Quote, self).__init__("quote", params) self.element = "
\n{0}\n
" class Export(Block): def __init__(self, language="", params=""): super(Export, self).__init__("export", params) self.language = language self.escape = self.language.upper() != "HTML" self.parsed_nodes = () def to_html(self): if not self.escape: return super(Export, self).to_html() return "" class Src(Block): def __init__(self, language="", params="", highlight=False): super(Src, self).__init__("src", params) self.language = language self.highlight_code = highlight self.element = "
\n{1}\n
" self.needparse = False self.escape = False self.parsed_nodes = () def add_child(self, node): self.children.append(node) def highlight(self, language, text): return src_highlight(language, text) def to_html(self): text = "\n".join([child.to_html() for child in self.children]) if self.highlight_code: return self.highlight(self.language, dedent(text)) if not self.language: return "
\n{0}\n
".format(dedent(text)) return self.element.format(self.language, dedent(text)) class Example(Src): def __init__(self, params="", highlight=False): super(Example, self).__init__("example", params, highlight) self.name = "example" class BlockResult(Parser): def __init__(self): super(BlockResult, self).__init__() self.element = "
\n{0}\n
" @classmethod def match(cls, line): match = BLOCK_RESULT_REGEXP.match(line) if not match: return return cls() def matchend(self, index, lines): return not BLOCK_RESULT_CONTENT_REGEXP.match(lines[index]) def parse(self, index, lines): match = BLOCK_RESULT_CONTENT_REGEXP.match(lines[index]) return self.inlinetext(match[2]), index class ListItem(Parser): def __init__(self, status=None, checkbox="HTML"): super(ListItem, self).__init__() self.status = status self.checkbox = checkbox self.element = "
  • \n{0}\n
  • " @classmethod def match(cls, line): status = None content = line status_match = LIST_STATUS_REGEXP.match(line) if status_match: status, content = status_match[1], content[len("[ ] "):] node = cls(status) node.add_child(node.inlinetext(content)) return node def set_status(self): if not self.checkbox: return if self.checkbox == "HTML": if self.status == "X": node = self.inlinetext( '') else: node = self.inlinetext('') node.needparse = False node.escape = False else: node = self.inlinetext("=[{0}]=".format(self.status)) if not self.children: self.children.append(node) return self.children[0].children = [node] + self.children[0].children def to_html(self): if self.status is not None: self.set_status() return super(ListItem, self).to_html() class DescriptiveItem(ListItem): def __init__(self, title="", status=""): super(DescriptiveItem, self).__init__(title, status) self.element = "
    \n{0}\n
    " class List(Parser): def __init__(self, items=[]): super(List, self).__init__() self.children = items @classmethod def match(cls, line): match = UnorderList.match(line) if match: return match match = OrderList.match(line) if match: return match return Descriptive.match(line) def add_child(self, node): if self.is_list(node) and node.level == self.level: self.children.append(node.children[0]) return last = self.last_child() last.add_child(node) class Descriptive(List): def __init__(self, items=[]): super(Descriptive, self).__init__(items) self.element = "
    \n{0}\n
    " @classmethod def match(cls, line): match = LIST_DESCRIPTIVE_REGEXP.match(line) if not match: return title = DescriptiveItem.match(match[3]) return cls([title]) class UnorderList(List): def __init__(self, items=[]): super(UnorderList, self).__init__(items) self.element = "" @classmethod def match(cls, line): match = LIST_UNORDER_REGEXP.match(line) if not match: return title = ListItem.match(match[4]) return cls([title]) class OrderList(List): def __init__(self, items=[]): super(OrderList, self).__init__(items) self.element = "
      \n{0}\n
    " @classmethod def match(cls, line): match = LIST_ORDER_REGEXP.match(line) if not match: return title = ListItem.match(match[4]) return cls([title]) class TableColumn(Parser): def __init__(self, content="", header=False): super(TableColumn, self).__init__(content) self.header = header self.parsed_nodes = () def add_child(self, child): self.children.append(child) def reset(self): self.header = True def to_html(self): self.element = "{0}" if self.header else "{0}" return super(TableColumn, self).to_html() class TableRow(Parser): def __init__(self, header=False): super(TableRow, self).__init__() self.is_sep = False self.header = header self.element = "\n{0}\n" self.parsed_nodes = ("tablecolumn", ) @classmethod def match(cls, line): match = TABLE_ROW_REGEXP.match(line) if not match: return row = cls() row.is_sep = bool(TABLE_SEP_REGEXP.match(line)) row.preparse(match[2].strip("|").split("|")) return row def add_child(self, child): self.children.append(child) def parse_tablecolumn(self, index, lines): return TableColumn(lines[index].strip(), self.header), index def reset(self): self.header = True for column in self.children: column.reset() class Table(Parser): def __init__(self, keyword=None): super(Table, self).__init__() self.element = "\n{0}\n
    " self.keyword = keyword self.parsed_nodes = ("tablerow", ) @classmethod def match(cls, line): row = TableRow.match(line) if not row: return table = cls() if row.is_sep: return table table.add_child(row) return table def matchend(self, index, lines): return not TABLE_ROW_REGEXP.match(lines[index]) def reset(self): first = self.first_child() if first and first.header: return for row in self.children: row.reset() def add_child(self, child): if child.is_sep: return self.reset() self.children.append(child) def parse_tablerow(self, index, lines): return TableRow.match(lines[index]), index class Keyword(Parser): def __init__(self, key, value=""): super(Keyword, self).__init__() self.key = key self.value = value def options(self): results = {} for line in self.value.split(" "): if not line: continue m = line.split(":", 1) k = m[0] if not k: continue results[k] = "" if len(m) == 1 else m[1] return results def properties(self): results = {} line = self.value.strip() if not line: return results m = line.split(" ", 1) k = m[0] if not k: return results results[k] = "" if len(m) == 1 else m[1] return results @classmethod def match(cls, line): match = KEYWORD_REGEXP.match(line) if not match: return return cls(match[2], match[4]) def to_html(self): return "" class Paragraph(Parser): def __init__(self, content=""): super(Paragraph, self).__init__(content) self.element = "

    \n{0}\n

    " self.parsed_nodes = () def add_child(self, node): self.children.append(node) class Section(Parser): def __init__(self, headline): super(Section, self).__init__() self.headline = headline @property def stars(self): return self.headline.stars def add_child(self, node): last = self.last_child() if not last: self.children.append(node) return if node.stars > last.stars: last.add_child(node) return self.children.append(node) def to_html(self): text = "
  • " text += "{1}".format( self.headline.id(), self.headline.toc(), ) if not self.children: return text + "
  • " text += "\n\n".format( "\n".join([child.to_html() for child in self.children])) return text class Toc(Parser): def __init__(self): super(Toc, self).__init__() self.element = ( '
    ' '

    Table of Contents

    ' '
    ' '\n\n
    ') def add_child(self, node): last = self.last_child() if not last: self.children.append(node) return if node.stars > last.stars: last.add_child(node) return if node.stars < last.stars: last.add_child(node) return self.children.append(node) def to_html(self): if not self.children: return "" return super(Toc, self).to_html() class Document(Parser): def __init__(self, content, offset=0, highlight=False, **options): super(Document, self).__init__(content) self.offset = offset self.highlight = highlight self.options = options self.properties = {} self.toc = Toc() def _is_true(self, value): return value in ("true", "t", "1", True, 1) def section(self, node): return Section(node) def parse_keyword(self, index, lines): block, index = super(Document, self).parse_keyword(index, lines) if not block: return block, index if block.key == "OPTIONS": self.options.update(**block.options()) elif block.key == "PROPERTY": self.properties.update(**block.properties()) else: self.properties[block.key] = block.value return block, index def parse_headline(self, index, lines): block, index = super(Document, self).parse_headline(index, lines) if not block: return block, index block.stars = block.stars + self.offset todo_keywords = self.properties.get("TODO") if todo_keywords: block.todo_keywords = todo_keywords.split(" ") s = block.title.split(" ", 1) if len(s) > 1 and s[0] in block.todo_keywords: block.keyword = s[0] block.title = s[1] self.toc.add_child(self.section(block)) return block, index def parse_block(self, index, lines): block, index = super(Document, self).parse_block(index, lines) if not block: return block, index if self.is_src(block): block.highlight_code = self.highlight return block, index def to_html(self): text = super(Document, self).to_html() if self._is_true(self.options.get("toc")): return self.toc.to_html() + "\n" + text return text