432 lines
12 KiB
432 lines
12 KiB
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ********************************************************************************
# Copyright © 2017-2020 jianglin
# File Name: inline.py
# Author: jianglin
# Email: mail@honmaple.com
# Created: 2018-02-26 11:41:22 (CST)
# Last Update: Tuesday 2020-08-18 17:21:40 (CST)
# By:
# Description:
# ********************************************************************************
import re
import os
# Template shared by all paired emphasis markers; "{0}" is filled with the
# (regex-escaped) marker.  Group 1 is whatever precedes the opening marker,
# group 2 the text between the two markers, group 3 the (lazy, usually empty)
# tail.  The (?<![/\\]) lookbehinds refuse a marker that directly follows a
# "/" or a "\".
_inline_regexp = r"(^|.*?(?<![/\\])){0}(.+?(?<![/\\])){0}(.*?|$)"
# *text*
BOLD_REGEXP = re.compile(_inline_regexp.format('\\*'))
# =text= or `text`
CODE_REGEXP = re.compile(_inline_regexp.format('(?:\\=|`)'))
# **text** or /text/
ITALIC_REGEXP = re.compile(_inline_regexp.format('(?:\\*\\*|\\/)'))
# +text+
DELETE_REGEXP = re.compile(_inline_regexp.format('\\+'))
# ~text~
VERBATIM_REGEXP = re.compile(_inline_regexp.format('~'))
# _text_
UNDERLINE_REGEXP = re.compile(_inline_regexp.format('_'))
# Progress cookie: [3/7] or [42%]; group 1 is the text inside the brackets.
PERCENT_REGEXP = re.compile(r"\[(\d+/\d+|\d+%)\]")
# Optional leading whitespace followed by five or more dashes.  The pattern is
# not anchored at the end of the line.
HR_REGEXP = re.compile(r"^\s*\-{5,}\s*")
# Footnote reference [fn:label]; group 2 is the whole reference, group 3 the
# label.
FN_REGEXP = re.compile(r"(^|.*?(?<![/\\]))(\[fn:(.+?)\])(.*?|$)")
# File extension (including the dot) of an image; lowercase only, since no
# IGNORECASE flag is given.
IMG_REGEXP = re.compile(r"^[.](png|gif|jpe?g|svg|tiff?)$")
# [[target]] or [[target][description]]; group 1 is the target, group 2 the
# optional description.
LINK_REGEXP = re.compile(r'\[\[(.+?)\](?:\[(.+?)\])?\]')
# File extension (including the dot) of a video; lowercase only.
VIDEO_REGEXP = re.compile(r"^[.](webm|mp4)$")
# Two backslashes followed only by whitespace up to the end of the line.
NEWLINE_REGEXP = re.compile(r"(^|.*?(?<![/\\]))(\\\\(\s*)$)")
# A line that is empty or consists only of whitespace.
BLANKLINE_REGEXP = re.compile(r"^(\s*)$")
# <YYYY-MM-DD[ Day][ HH:MM][ +N(d|w|m|y)]>; groups are date, day name, time and
# repeat interval (the last three keep their leading space).
TIMESTAMP_REGEXP = re.compile(
r"^<(\d{4}-\d{2}-\d{2})( [A-Za-z]+)?( \d{2}:\d{2})?( \+\d+[dwmy])?>")
# Ordered (character, HTML entity) pairs that html_escape() applies one after
# another.  "&" has to stay first: otherwise the ampersands introduced by the
# later entities would themselves be escaped.
# NOTE(review): the replacement column had been reduced to the raw characters
# (making every substitution a no-op and leaving the tuple unterminated); the
# entities below are the conventional ones -- confirm "&#39;" is the desired
# spelling for the apostrophe.
_html_escape = (
    ("&", "&amp;"),
    ("'", "&#39;"),
    ("<", "&lt;"),
    (">", "&gt;"),
    ("\"", "&quot;"),
)
# https://github.com/tsroten/zhon/blob/develop/zhon/hanzi.py
# Chinese punctuation that does not end a sentence, as one string built from
# adjacent literals.
# NOTE(review): only the section comments of this table survived; the code
# points below were restored from the zhon `hanzi.non_stops` table referenced
# above and must be verified against that source.
_chinese_non_stops = (
    # Fullwidth ASCII variants
    '\uFF02\uFF03\uFF04\uFF05\uFF06\uFF07\uFF08\uFF09\uFF0A\uFF0B\uFF0C\uFF0D'
    '\uFF0F\uFF1A\uFF1B\uFF1C\uFF1D\uFF1E\uFF20\uFF3B\uFF3C\uFF3D\uFF3E\uFF3F'
    '\uFF40\uFF5B\uFF5C\uFF5D\uFF5E\uFF5F\uFF60'

    # Halfwidth CJK punctuation
    '\uFF62\uFF63\uFF64'

    # CJK symbols and punctuation
    '\u3000\u3001\u3003'

    # CJK angle and corner brackets
    '\u3008\u3009\u300A\u300B\u300C\u300D\u300E\u300F\u3010\u3011'

    # CJK brackets and symbols/punctuation
    '\u3014\u3015\u3016\u3017\u3018\u3019\u301A\u301B\u301C\u301D\u301E\u301F'

    # Other CJK symbols
    '\u3030'

    # Special CJK indicators
    '\u303E\u303F'

    # Dashes
    '\u2013\u2014'

    # Quotation marks and apostrophe
    '\u2018\u2019\u201B\u201C\u201D\u201E\u201F'

    # General punctuation
    '\u2026\u2027'

    # Overscores and underscores
    '\uFE4F'

    # Small form variants
    '\uFE51\uFE54'

    # Latin punctuation
    '\u00B7'
)
# Chinese sentence-ending punctuation, as one string built from adjacent
# literals (the parentheses only group them; this is not a tuple).
_chinese_stops = (
    '\uFF01'  # Fullwidth exclamation mark
    '\uFF1F'  # Fullwidth question mark
    '\uFF61'  # Halfwidth ideographic full stop
    '\u3002'  # Ideographic full stop
)
def html_escape(text):
    """Return *text* with the characters listed in ``_html_escape`` replaced.

    The pairs are applied strictly in table order, each as a plain
    ``str.replace`` over the result of the previous one.

    :param text: string to escape.
    :returns: the escaped string.
    """
    for raw, escaped in _html_escape:
        text = text.replace(raw, escaped)
    return text
def match_chinese(ch):
    """Tell whether *ch* is a Chinese ideograph or Chinese punctuation.

    :param ch: a single character.
    :returns: ``True`` when *ch* lies in U+4E00..U+9FFF or occurs in
        ``_chinese_stops`` / ``_chinese_non_stops``; ``False`` otherwise.
    """
    # Range test first: it is the common case and needs no table lookup.
    if '\u4e00' <= ch <= '\u9fff':
        return True
    return ch in _chinese_stops or ch in _chinese_non_stops
def match_emphasis(cls, regexp, line, index):
    """Try to read one emphasis span from *line* starting at *index*.

    A span is accepted only if it is properly delimited: the character before
    the opening marker must be a space, one of ``-({'"`` or Chinese text, and
    the character after the closing marker must be a space, one of
    ``-.,:!?;'")}[`` or Chinese text.  The start and the end of the line
    always count as valid borders.

    :param cls: callable invoked with the text between the markers (group 2
        of *regexp*) to build the node.
    :param regexp: compiled pattern with at least two groups.
    :param line: text being scanned.
    :param index: position of the opening marker in *line*.
    :returns: ``(node, last)`` where *last* is the index of the final
        character matched, or ``(None, index)`` when nothing was recognised.
    """
    match = regexp.match(line, index)
    if not match:
        return None, index

    end = match.end()

    if index != 0:
        before = line[index - 1]
        allowed = before == " " or before in "-({'\""
        if not allowed and not match_chinese(before):
            return None, index

    if end < len(line):
        after = line[end]
        allowed = after == " " or after in "-.,:!?;'\")}["
        if not allowed and not match_chinese(after):
            return None, index

    # Callers add 1 to the returned index, hence the last matched position
    # rather than match.end().
    return cls(match[2]), end - 1
class InlineParser(object):
    """Base class for inline nodes and the scanner that produces them.

    An instance holds raw ``content``, a list of ``children`` nodes and an
    ``element`` format string (``"{0}"`` receives the rendered children).
    ``preparse`` walks a string character by character and fills
    ``children``; ``to_html`` renders the node.
    """

    # (marker, handler suffix) pairs tried in this order by parse().  Order
    # matters where markers share a prefix: "**" is tried before "*" and "[["
    # before "[".  Kept at class level so it is built once instead of once
    # per scanned character.
    _inline_markers = (
        ("=", "code"),
        ("`", "code"),
        ("~", "verbatim"),
        ("_", "underline"),
        ("+", "delete"),
        ("/", "italic"),
        ("**", "italic"),
        ("*", "bold"),
        ("[[", "link"),
        ("[", "percent"),
        ("\\", "newline"),
    )

    def __init__(self, content=""):
        self.content = content
        self.children = []
        self.element = ""

    def add_child(self, child):
        """Append *child* to this node's children."""
        self.children.append(child)

    # The parse_<kind> hooks below are looked up by name from parse(); each
    # returns ``(node_or_None, index)`` as produced by the class's ``match``.

    def parse_code(self, index, lines):
        return Code.match(lines, index)

    def parse_bold(self, index, lines):
        return Bold.match(lines, index)

    def parse_italic(self, index, lines):
        return Italic.match(lines, index)

    def parse_delete(self, index, lines):
        return Delete.match(lines, index)

    def parse_verbatim(self, index, lines):
        return Verbatim.match(lines, index)

    def parse_underline(self, index, lines):
        return Underline.match(lines, index)

    def parse_percent(self, index, lines):
        return Percent.match(lines, index)

    def parse_link(self, index, lines):
        return Link.match(lines, index)

    def parse_fn(self, index, lines):
        return Fn.match(lines, index)

    def parse_newline(self, index, lines):
        return Newline.match(lines, index)

    def parse(self, index, lines):
        """Recognise the construct that starts at ``lines[index]``.

        :param index: position to inspect; must be a valid index of *lines*.
        :param lines: the text being scanned.
        :returns: ``(node, index)`` for a recognised construct or a new
            ``Text`` node, or ``(None, index)`` when the character was merged
            into the trailing ``Text`` child instead.
        :raises IndexError: if *index* is outside *lines*.
        """
        single_char = lines[index]

        for marker, kind in self._inline_markers:
            if lines[index:index + len(marker)] != marker:
                continue
            node, num = getattr(self, "parse_" + kind)(index, lines)
            if node:
                return node, num

        if lines[index:index + 3] == "[fn":
            node, num = self.parse_fn(index, lines)
            if node:
                return node, num

        # Plain character: extend the previous Text node when there is one so
        # that consecutive characters end up in a single node.
        child = self.last_child()
        if child and isinstance(child, Text):
            child.content += single_char
            return None, index
        return Text(single_char), index

    def last_child(self):
        """Return the most recently added child, or ``None`` if there is none."""
        if not self.children:
            return None
        return self.children[-1]

    def preparse(self, lines):
        """Scan *lines* from start to end and add every node found as a child."""
        index = 0
        while index < len(lines):
            # parse() hands back the index it stopped at; move one past it.
            block, index = self.parse(index, lines)
            index += 1
            if block:
                self.add_child(block)

    def to_html(self):
        """Render this node.

        A node without children renders as its raw ``content``; otherwise the
        children are rendered and concatenated, then wrapped in ``element``
        when one is set.
        """
        if not self.children and self.content:
            return self.content

        text = "".join(child.to_html() for child in self.children)
        if self.element:
            return self.element.format(text)
        return text

    def __str__(self):
        return '{}({})'.format(self.__class__.__name__, self.content.strip())

    def __repr__(self):
        return self.__str__()
class Text(InlineParser):
    """A run of plain characters; rendered exactly as stored."""

    def to_html(self):
        return self.content
class Newline(InlineParser):
    """A forced line break (``\\\\`` at the end of a line)."""

    # Invoked on the class itself (``Newline.match(lines, index)``), so it has
    # to be a classmethod.
    @classmethod
    def match(cls, line, index):
        """Match a line break at *index*.

        :returns: ``(node, last)`` with *last* the index of the final matched
            character, or ``(None, index)`` when there is no match.
        """
        match = NEWLINE_REGEXP.match(line, index)
        if not match:
            return None, index
        return cls(), match.end() - 1

    def to_html(self):
        return "<br/>"
class Bold(InlineParser):
    """Bold emphasis (``*text*``), rendered as ``<b>``."""

    def __init__(self, content):
        super(Bold, self).__init__(content)
        self.element = "<b>{0}</b>"

    # Invoked on the class itself (``Bold.match(lines, index)``), so it has to
    # be a classmethod.
    @classmethod
    def match(cls, line, index):
        """Return ``(node, last_index)`` or ``(None, index)``; see match_emphasis."""
        return match_emphasis(cls, BOLD_REGEXP, line, index)
class Code(InlineParser):
    """Inline code (``=text=`` or `` `text` ``), rendered as ``<code>``."""

    def __init__(self, content):
        super(Code, self).__init__(content)
        self.element = "<code>{0}</code>"

    # Invoked on the class itself (``Code.match(lines, index)``), so it has to
    # be a classmethod.
    @classmethod
    def match(cls, line, index):
        """Return ``(node, last_index)`` or ``(None, index)``; see match_emphasis."""
        return match_emphasis(cls, CODE_REGEXP, line, index)
class Italic(InlineParser):
    """Italic emphasis (``/text/`` or ``**text**``), rendered as ``<i>``."""

    def __init__(self, content):
        super(Italic, self).__init__(content)
        self.element = "<i>{0}</i>"

    # Invoked on the class itself (``Italic.match(lines, index)``), so it has
    # to be a classmethod.
    @classmethod
    def match(cls, line, index):
        """Return ``(node, last_index)`` or ``(None, index)``; see match_emphasis."""
        return match_emphasis(cls, ITALIC_REGEXP, line, index)
class Delete(InlineParser):
    """Struck-through text (``+text+``), rendered as ``<del>``."""

    def __init__(self, content):
        super(Delete, self).__init__(content)
        self.element = "<del>{0}</del>"

    # Invoked on the class itself (``Delete.match(lines, index)``), so it has
    # to be a classmethod.
    @classmethod
    def match(cls, line, index):
        """Return ``(node, last_index)`` or ``(None, index)``; see match_emphasis."""
        return match_emphasis(cls, DELETE_REGEXP, line, index)
class Verbatim(InlineParser):
    """Verbatim text (``~text~``), rendered as ``<code>``."""

    def __init__(self, content):
        super(Verbatim, self).__init__(content)
        self.element = "<code>{0}</code>"

    # Invoked on the class itself (``Verbatim.match(lines, index)``), so it
    # has to be a classmethod.
    @classmethod
    def match(cls, line, index):
        """Return ``(node, last_index)`` or ``(None, index)``; see match_emphasis."""
        return match_emphasis(cls, VERBATIM_REGEXP, line, index)
class Underline(InlineParser):
    """Underlined text (``_text_``), rendered as a styled ``<span>``."""

    def __init__(self, content):
        super(Underline, self).__init__(content)
        self.element = "<span style=\"text-decoration:underline\">{0}</span>"

    # Invoked on the class itself (``Underline.match(lines, index)``), so it
    # has to be a classmethod.
    @classmethod
    def match(cls, line, index):
        """Return ``(node, last_index)`` or ``(None, index)``; see match_emphasis."""
        return match_emphasis(cls, UNDERLINE_REGEXP, line, index)
class Percent(InlineParser):
    """Progress cookie (``[3/7]`` or ``[42%]``), rendered inside ``<code>``."""

    def __init__(self, content):
        super(Percent, self).__init__(content)
        self.element = "<code>[{0}]</code>"

    # Invoked on the class itself (``Percent.match(lines, index)``), so it has
    # to be a classmethod.
    @classmethod
    def match(cls, line, index):
        """Match a progress cookie at *index*.

        :returns: ``(node, last)`` with *last* the index of the closing
            ``]``, or ``(None, index)`` when there is no match.
        """
        match = PERCENT_REGEXP.match(line, index)
        if not match:
            return None, index
        # InlineParser.preparse adds 1 to the returned index, so hand back the
        # last consumed position; returning match.end() made the scanner skip
        # the character that follows the cookie.
        return cls(match[1]), match.end() - 1
class Link(InlineParser):
    """A ``[[target]]`` or ``[[target][description]]`` link.

    ``content`` holds the target and ``desc`` the optional description.  A
    link without description whose target ends in an image or video extension
    is rendered as ``<img>`` / ``<video>`` instead of ``<a>``.
    """

    def __init__(self, url, desc=None):
        super(Link, self).__init__(url)
        self.desc = desc

    # Invoked on the class itself (``Link.match(lines, index)``), so it has to
    # be a classmethod.
    @classmethod
    def match(cls, line, index):
        """Match a link at *index*.

        :returns: ``(node, last)`` with *last* the index of the final ``]``,
            or ``(None, index)`` when there is no match.
        """
        match = LINK_REGEXP.match(line, index)
        if not match:
            return None, index
        # InlineParser.preparse adds 1 to the returned index, so hand back the
        # last consumed position; returning match.end() made the scanner skip
        # the character that follows the link.
        return cls(match[1], match[2]), match.end() - 1

    def is_img(self):
        """Truthy when the link has no description and an image extension."""
        _, ext = os.path.splitext(self.content)
        return not self.desc and IMG_REGEXP.match(ext)

    def is_vedio(self):
        """Truthy when the link has no description and a video extension.

        The misspelt name is kept because it is part of the public interface.
        """
        _, ext = os.path.splitext(self.content)
        return not self.desc and VIDEO_REGEXP.match(ext)

    def is_video(self):
        """Correctly spelt alias of :meth:`is_vedio`."""
        return self.is_vedio()

    def to_html(self):
        if self.is_img():
            return "<img src=\"{0}\"/>".format(self.content)
        if self.is_vedio():
            return "<video src=\"{0}\">{0}</video>".format(self.content)
        # Without a description the target doubles as the link text.
        label = self.desc if self.desc else self.content
        return '<a href="{0}">{1}</a>'.format(self.content, label)
class Fn(InlineParser):
    """A footnote reference (``[fn:label]``), rendered as a superscript link."""

    def __init__(self, content):
        super(Fn, self).__init__(content)
        self.element = '<sup><a id="fnr:{0}" class="footref" href="#fn.{0}">{0}</a></sup>'

    # Invoked on the class itself (``Fn.match(lines, index)``), so it has to
    # be a classmethod.
    @classmethod
    def match(cls, line, index):
        """Match a footnote reference at *index*.

        :returns: ``(node, last)`` with *last* the index of the final matched
            character, or ``(None, index)`` when there is no match.
        """
        match = FN_REGEXP.match(line, index)
        if not match:
            return None, index
        # InlineParser.preparse adds 1 to the returned index, so hand back the
        # last consumed position; returning match.end() made the scanner skip
        # the character that follows the reference.
        return cls(match[3]), match.end() - 1

    def to_html(self):
        # The label itself is formatted into the element; the base class would
        # return the bare label because this node has no children.
        return self.element.format(self.content)
class Timestamp(InlineParser):
    """An active timestamp such as ``<2020-08-18 Tue 17:21 +1w>``.

    ``date`` is the ``YYYY-MM-DD`` part; ``time`` and ``interval`` are the
    optional ``HH:MM`` and repeat groups exactly as captured (each keeps its
    leading space, or is ``None`` when absent).
    """

    def __init__(self, date="", time="", interval=None):
        super(Timestamp, self).__init__()
        self.date = date
        self.time = time
        self.interval = interval

    # Takes ``cls`` and builds an instance, so it has to be a classmethod to
    # be callable as ``Timestamp.match(line, index)``.
    @classmethod
    def match(cls, line, index):
        """Match a timestamp at *index*.

        :returns: ``(node, end)`` with *end* the position just past the
            closing ``>``, or ``(None, index)`` when there is no match.
        """
        match = TIMESTAMP_REGEXP.match(line, index)
        if not match:
            return None, index
        # NOTE(review): unlike the emphasis matchers this returns match.end()
        # rather than the last matched index; left unchanged because no caller
        # is visible in this file -- confirm which convention callers expect.
        return cls(match[1], match[3], match[4]), match.end()
class Blankline(InlineParser):
    """An empty or whitespace-only line; renders as nothing."""

    def __init__(self):
        super(Blankline, self).__init__()

    # Takes ``cls`` and builds an instance, so it has to be a classmethod.
    @classmethod
    def match(cls, line):
        """Return a ``Blankline`` when *line* is blank, otherwise ``None``."""
        # The instance must be produced on a match, not on a miss.
        if not BLANKLINE_REGEXP.match(line):
            return None
        return cls()

    def to_html(self):
        return ""
class Hr(InlineParser):
    """A horizontal rule line (five or more dashes)."""

    def __init__(self):
        super(Hr, self).__init__()

    # Takes ``cls`` and builds an instance, so it has to be a classmethod.
    @classmethod
    def match(cls, line):
        """Return an ``Hr`` when *line* is a rule, otherwise ``None``."""
        if HR_REGEXP.match(line):
            return cls()
        return None

    def to_html(self):
        # The rule renders as an empty string.
        return ""
class InlineText(InlineParser):
    """A piece of text that is optionally escaped and parsed for inline markup.

    :param content: the raw text.
    :param needparse: when false, ``to_html`` returns the (escaped) text
        without looking for inline markup.
    :param escape: when true, the text is passed through ``html_escape``
        before anything else.
    """

    def __init__(self, content="", needparse=True, escape=True):
        super(InlineText, self).__init__(content)
        self.needparse = needparse
        self.escape = escape

    def to_html(self):
        """Render the text as HTML.

        ``self.content`` is left untouched, so calling this method more than
        once yields the same result instead of escaping the text again.
        """
        # Escape into a local: writing the result back to self.content made
        # every further call escape the already-escaped text ("&" -> "&amp;"
        # -> "&amp;amp;").
        text = html_escape(self.content) if self.escape else self.content
        if not self.needparse:
            return text
        # Nothing else in this module triggers the scan, so do it here, once;
        # children already present (e.g. a caller ran preparse itself) are
        # reused as they are.
        if not self.children:
            self.preparse(text)
        return super(InlineText, self).to_html()