1. 程式人生 > >使用Python3將Markdown(.md)文本轉換成 html、pdf

使用Python3將Markdown(.md)文本轉換成 html、pdf

isp break 段落 close all ict ddr tran 有序

一、Markdown中不同的文本內容會分成不同的文本塊,並通過markdown的語法控制進行文本的拼接,組成新的文件。

二、利用Python3實現(.md)文件轉換成(.html)文件

  在cmd命令行下進入(.py)文件目錄下,使用命令進行執行

  >python md2html.py < file.md > file.html

import sys, re

# Generator helpers
def lines(file):
    """Yield every line of *file*, followed by one extra newline-only line.

    The trailing blank line lets a consumer that groups text on blank lines
    flush its final group even when the input does not end with one.
    """
    yield from file
    yield '\n'
def blocks(file):
    """Yield the text blocks of *file*.

    A block is a run of consecutive non-blank lines, joined and stripped.
    Iteration goes through the sibling ``lines`` generator, whose trailing
    blank line flushes the last block.
    """
    block = []
    for line in lines(file):
        if line.strip():
            block.append(line)
        elif block:
            yield ''.join(block).strip()
            block = []


# Block handlers
class Handler:
    """Base handler: dispatches events to ``<prefix><name>`` methods by name."""

    def callback(self, prefix, name, *args):
        """Call ``self.<prefix><name>(*args)`` if it exists; otherwise return None."""
        method = getattr(self, prefix + name, None)
        if callable(method):
            return method(*args)

    def start(self, name):
        """Fire the ``start_<name>`` event."""
        self.callback('start_', name)

    def end(self, name):
        """Fire the ``end_<name>`` event."""
        self.callback('end_', name)

    def sub(self, name):
        """Return a replacement function for ``re.sub`` backed by ``sub_<name>``.

        When no such method exists (or it returns None) the matched text is
        left unchanged.
        """
        def substitution(match):
            result = self.callback('sub_', name, match)
            if result is None:
                result = match.group(0)
            return result
        return substitution


class HTMLRenderer(Handler):
    """HTML handler: prints the HTML markup that wraps each kind of block."""

    def start_document(self):
        print('<html><head><title>Python文本解析</title></head><body>')

    def end_document(self):
        print('</body></html>')

    def start_paragraph(self):
        print('<p style="color: #444;">')

    def end_paragraph(self):
        print('</p>')

    def start_heading(self):
        print('<h2 style="color: #68BE5D;">')

    def end_heading(self):
        print('</h2>')

    def start_list(self):
        print('<ul style="color: #363736;">')

    def end_list(self):
        print('</ul>')

    def start_listitem(self):
        print('<li>')

    def end_listitem(self):
        print('</li>')

    def start_title(self):
        print('<h1 style="color: #1ABC9C;">')

    def end_title(self):
        print('</h1>')

    def sub_emphasis(self, match):
        return '<em>%s</em>' % match.group(1)

    def sub_url(self, match):
        return ('<a target="_blank" style="text-decoration: none;color: #BC1A4B;" href="%s">%s</a>'
                % (match.group(1), match.group(1)))

    def sub_mail(self, match):
        return ('<a style="text-decoration: none;color: #BC1A4B;" href="mailto:%s">%s</a>'
                % (match.group(1), match.group(1)))

    def feed(self, data):
        print(data)


# Rules: decide how each text block is handled
class Rule:
    """Base rule: wraps the block in start/end events of ``self.type``."""

    def action(self, block, handler):
        """Emit the block between its start and end markup; True stops further rules."""
        handler.start(self.type)
        handler.feed(block)
        handler.end(self.type)
        return True


class HeadingRule(Rule):
    """Heading: a single line of at most 70 characters that does not end with a colon."""
    type = 'heading'

    def condition(self, block):
        """Return whether *block* qualifies as a heading."""
        return not '\n' in block and len(block) <= 70 and not block[-1] == ':'


class TitleRule(HeadingRule):
    """Title: only the first block offered to this rule, if it qualifies as a heading."""
    type = 'title'
    first = True

    def condition(self, block):
        if not self.first:
            return False
        # Only the very first block offered to this rule can be the title.
        self.first = False
        return HeadingRule.condition(self, block)


class ListItemRule(Rule):
    """List item: a block that starts with a hyphen."""
    type = 'listitem'

    def condition(self, block):
        return block[0] == '-'

    def action(self, block, handler):
        handler.start(self.type)
        # Drop the leading hyphen and surrounding whitespace.
        handler.feed(block[1:].strip())
        handler.end(self.type)
        return True


class ListRule(ListItemRule):
    """List: opens the list before its first item and closes it after its last."""
    type = 'list'
    inside = False

    def condition(self, block):
        # Always inspected, so the transition out of a list is never missed.
        return True

    def action(self, block, handler):
        if not self.inside and ListItemRule.condition(self, block):
            handler.start(self.type)
            self.inside = True
        elif self.inside and not ListItemRule.condition(self, block):
            handler.end(self.type)
            self.inside = False
        # Never consumes the block: later rules still have to render it.
        return False


class ParagraphRule(Rule):
    """Paragraph: the fallback rule, matches every block."""
    type = 'paragraph'

    def condition(self, block):
        return True


class Code(Rule):
    """Placeholder for a code-box / syntax-highlighting rule (not implemented)."""
    pass


# Whole-document parsing
class Parser:
    """Base parser: applies filters, then the first matching rules, to every block."""

    def __init__(self, handler):
        self.handler = handler
        self.rules = []
        self.filters = []

    def addRule(self, rule):
        """Append *rule*; rules are tried in insertion order."""
        self.rules.append(rule)

    def addFilter(self, pattern, name):
        """Register an inline substitution of regex *pattern* handled by ``sub_<name>``."""
        def apply_filter(block, handler):
            return re.sub(pattern, handler.sub(name), block)
        self.filters.append(apply_filter)

    def parse(self, file):
        """Render every block of *file* through the handler."""
        self.handler.start('document')
        for block in blocks(file):
            for apply_filter in self.filters:
                block = apply_filter(block, self.handler)
            for rule in self.rules:
                if rule.condition(block):
                    # A truthy action result means the block is fully handled.
                    if rule.action(block, self.handler):
                        break
        self.handler.end('document')


class BasicTextParser(Parser):
    """Plain-text parser with the list, title, heading and paragraph rules installed."""

    def __init__(self, handler):
        Parser.__init__(self, handler)
        self.addRule(ListRule())
        self.addRule(ListItemRule())
        self.addRule(TitleRule())
        self.addRule(HeadingRule())
        self.addRule(ParagraphRule())
        self.addFilter(r'\*(.+?)\*', 'emphasis')
        self.addFilter(r'(http://[\.a-zA-Z/]+)', 'url')
        self.addFilter(r'([\.a-zA-Z]+@[\.a-zA-Z]+[a-zA-Z]+)', 'mail')


# Run the converter: text on stdin, HTML on stdout.
# Guarded so that importing this module does not block on stdin.
if __name__ == '__main__':
    handler = HTMLRenderer()
    parser = BasicTextParser(handler)
    parser.parse(sys.stdin)

三、利用Python3將文本轉化成pdf文件

  命令>python md2pdf.py 源文件 目標文件 [options]

Options:
    -h --help     show help document.
    -v --version  show version information.
    -o --output   translate sourcefile into html file.
    -p --print    translate sourcefile into pdf file and html file respectively.
    -P --Print    translate sourcefile into pdf file only.

import os,re
import sys,getopt
from enum import Enum
from subprocess import call
from functools import reduce

from docopt import docopt

# Version value handed to docopt (see main) for the -v/--version option.
__version__ = 1.0

# 定義三個枚舉類
# Table parsing state
class TABLE(Enum):
    """States of the table recogniser.

    Init   -- not inside a table.
    Format -- a candidate header row has been buffered; waiting for the
              dash separator row.
    Table  -- header confirmed; following rows are table body rows.
    """
    Init, Format, Table = 1, 2, 3

# Ordered-list parsing state
class ORDERLIST(Enum):
    """States of the ordered-list recogniser: outside (Init) or inside (List) a list."""
    Init, List = 1, 2
# Fenced-block parsing state
class BLOCK(Enum):
    """States of the fenced-block recogniser: outside (Init) or inside (Block) a fence.

    CodeBlock is declared but never assigned in the code visible in this file.
    """
    Init, Block, CodeBlock = 1, 2, 3

# Module-level parser state: every recogniser starts in its idle state.
table_state, orderList_state, block_state = TABLE.Init, ORDERLIST.Init, BLOCK.Init

# is_code: the open fenced block is a code block.
# is_normal: the current line is plain text that should get a trailing line break.
is_code, is_normal = False, True

# Candidate table header row, kept as split cells and as the raw line,
# until the following line confirms or rejects the table.
temp_table_first_line, temp_table_first_line_str = [], ""

# Set once a $...$ formula is seen, so the MathJax script gets emitted.
need_mathjax = False


def test_state(input):
    """Advance the module-level state machines by one source line.

    Handles the constructs that span several lines: fenced blocks
    (``` fences), ordered lists and tables. It updates the global state
    variables and returns the HTML (or pass-through text) for this line.
    An empty string is returned while a candidate table header row is being
    held back.

    ``is_normal`` is cleared for lines that became structural markup, so the
    caller does not append a line break to them.
    """
    # is_normal must be listed here: without it the assignments below only
    # created a local and the caller never saw them.
    global table_state, orderList_state, block_state, is_code, is_normal
    global temp_table_first_line, temp_table_first_line_str

    result = input

    # ---- fenced blocks -------------------------------------------------
    opens_plain_block = re.match(r'```(\s)*\n', input)

    if opens_plain_block and block_state == BLOCK.Init:
        # Bare fence: rendered as a block quote.
        result = "<blockquote>"
        block_state = BLOCK.Block
        is_normal = False
    elif (len(input) > 4 and input.startswith(('```python', '```c++', '```c'))
            and block_state == BLOCK.Init):
        # Fence followed by a language tag: rendered as code.
        block_state = BLOCK.Block
        result = "<code></br>"
        is_code = True
        is_normal = False
    elif block_state == BLOCK.Block and input == '```\n':
        # Closing fence.
        result = "</code>" if is_code else "</blockquote>"
        block_state = BLOCK.Init
        is_code = False
        is_normal = False
    elif block_state == BLOCK.Block:
        # Line inside a fence: keep its whitespace visible in HTML.
        result = re.sub(r'[\n\r\v\f\ ]', "&nbsp", result)
        result = re.sub(r'\t', "&nbsp" * 4, result)
        result = "<span>" + result + "</span></br>"
        is_normal = False

    # ---- ordered lists ("1.text"; single-digit numbers only) ------------
    is_item = len(input) > 2 and input[0].isdigit() and input[1] == '.'
    if is_item:
        if orderList_state == ORDERLIST.Init:
            orderList_state = ORDERLIST.List
            result = "<ol><li>" + input[2:] + "</li>"
        else:
            result = "<li>" + input[2:] + "</li>"
        is_normal = False
    elif orderList_state == ORDERLIST.List:
        # First non-item line after a list closes it.
        result = "</ol>" + input
        orderList_state = ORDERLIST.Init

    # ---- tables ----------------------------------------------------------
    if re.match(r'^((.+)\|)+((.+))$', input):
        cells = input.split('|')
        # Drop only the line terminator; a last line without one keeps its text.
        cells[-1] = cells[-1].rstrip('\n')
        # Discard the empty cells produced by leading/trailing pipes.
        if cells[0] == '':
            cells.pop(0)
        if cells[-1] == '':
            cells.pop(-1)

        if table_state == TABLE.Init:
            # Possible header row: hold it back until the next line is seen.
            table_state = TABLE.Format
            temp_table_first_line = cells
            temp_table_first_line_str = input
            result = ""
            is_normal = False
        elif table_state == TABLE.Format:
            if all(all_same(cell, '-') for cell in cells):
                # Dash separator row: the buffered line really was a header.
                table_state = TABLE.Table
                header = "".join("<th>" + cell + "</th>" for cell in temp_table_first_line)
                result = "<table><thead><tr>" + header + "</tr></thead><tbody>"
                is_normal = False
            else:
                # Not a table after all: emit both lines as ordinary text.
                result = temp_table_first_line_str + "</br>" + input
                table_state = TABLE.Init
        elif table_state == TABLE.Table:
            result = "<tr>" + "".join("<td>" + cell + "</td>" for cell in cells) + "</tr>"
            is_normal = False

    elif table_state == TABLE.Table:
        # First non-table line closes the table.
        table_state = TABLE.Init
        result = "</tbody></table>" + result
    elif table_state == TABLE.Format:
        # The held-back line was not followed by a table row: release it
        # instead of silently dropping it.
        result = temp_table_first_line_str + "</br>" + result
        table_state = TABLE.Init

    return result

# Check whether lst consists solely of repetitions of the character sym
def all_same(lst, sym):
    """Return True when *lst* is empty or equals *sym* repeated ``len(lst)`` times."""
    if not lst:
        return True
    return lst == sym * len(lst)

# Render a heading line
def handleTitle(s, n):
    """Wrap *s*, minus its first *n* characters, in an ``<hN>`` element."""
    level = repr(n)
    return "".join(("<h", level, ">", s[n:], "</h", level, ">"))

# Render an unordered-list line
def handleUnorderd(s):
    """Drop the leading bullet character of *s* and wrap the rest in ``<ul><li>``."""
    item_text = s[1:]
    return "<ul><li>{}</li></ul>".format(item_text)


def tokenTemplate(s, match):
    """Return the regex source that captures text wrapped in the marker *match*.

    Supported markers are ``*``, ``**`` and ``~~``; any other marker yields
    an empty pattern. *s* is accepted for interface compatibility and unused.
    """
    # Raw strings: backslash-star and backslash-tilde are regex escapes,
    # not Python string escapes.
    templates = {
        '*': r"\*([^\*]*)\*",
        '~~': r"\~\~([^\~\~]*)\~\~",
        '**': r"\*\*([^\*\*]*)\*\*",
    }
    return templates.get(match, "")

# Handle the inline markers **, * and ~~
def tokenHandler(s):
    """Replace inline emphasis markers in *s* with HTML tags.

    ``**x**`` becomes ``<b>x</b>``, ``*x*`` becomes ``<i>x</i>`` and
    ``~~x~~`` becomes ``<S>x</S>``. Bold is processed before italic so that
    double asterisks are not consumed as two single ones. As a side effect,
    the module-level ``need_mathjax`` flag is set when *s* contains a
    ``$...$`` span.
    """
    global need_mathjax
    for marker, tag in (('**', 'b'), ('*', 'i'), ('~~', 'S')):
        # re.sub replaces every non-overlapping match in a single pass.
        s = re.sub(tokenTemplate(s, marker), r'<%s>\1</%s>' % (tag, tag), s)
    if re.search(r'\$([^\$]*)\$', s):
        need_mathjax = True
    return s

# Handle links, images and superscripts
def link_image(s):
    """Convert images, superscripts and hyperlinks in *s* to HTML.

    * ``![alt](src)``  -> ``<img src="src" alt="alt">``
    * ``x^[2]``        -> ``x<sup>2</sup>``
    * ``[text](url)``  -> ``<a href="url" target="_blank">text</a>``

    Every occurrence on the line is converted. A backslash directly in front
    of the opening bracket of a link is still accepted and dropped, because
    that was the only form the previous pattern recognised.
    """
    # Images go first so their brackets are gone before the link pass runs.
    s = re.sub(r'!\[([^\]]*)\]\(([^)]*)\)', r'<img src="\2" alt="\1">', s)

    # Superscripts go before links so "f^[2](x)" is not read as a link.
    s = re.sub(r'(.)\^\[([^\]]*)\]', r'\1<sup>\2</sup>', s)

    # Negated character classes (instead of greedy ".*") keep two links on
    # one line from being merged into a single match.
    s = re.sub(r'\\?\[([^\]]*)\]\(([^)]*)\)',
               r'<a href="\2" target="_blank">\1</a>', s)

    return s


def parse(input):
    """Convert one line of Markdown source into an HTML fragment.

    The line first goes through ``test_state`` for the multi-line constructs
    (fenced blocks, ordered lists, tables). Unless it sits inside a fenced
    block, it is then checked for a heading, a horizontal rule, an unordered
    list item and a block quote, and finally has inline markers, links and
    images converted. Plain text lines get a trailing ``</br>``.
    """
    global block_state, is_normal
    is_normal = True

    # Update the multi-line state machines and get this line's first rendering.
    result = test_state(input)

    # Inside a fenced block nothing else is interpreted.
    if block_state == BLOCK.Block:
        return result

    # Heading: the level is the number of leading '#', capped at 6.
    title_rank = min(len(input) - len(input.lstrip('#')), 6)
    if title_rank != 0:
        return handleTitle(input, title_rank)

    # Horizontal rule: a line of two or more dashes and nothing else.
    if len(input) > 2 and all_same(input[:-1], '-') and input[-1] == '\n':
        return "<hr>"

    # Unordered list item.
    if result != "" and result[0] in ('+', '-'):
        result = handleUnorderd(result)
        is_normal = False

    # Block quote: one nesting level per leading '>'. Counting with lstrip
    # cannot run past the end of a line that consists only of '>'.
    count = len(input) - len(input.lstrip('>'))
    if count:
        result = ("<blockquote style=\"color:#8fbc8f\"> " * count
                  + "<b>" + input[count:] + "</b>"
                  + "</blockquote>" * count)
        is_normal = False

    # Inline markers such as **, * and ~~.
    result = tokenHandler(result)

    # Links, images and superscripts.
    result = link_image(result)

    # Plain, non-blank text lines get an explicit line break.
    is_blank = re.match(r'^(\s)*$', input)
    if input.endswith("\n") and is_normal and not is_blank:
        result += "</br>"

    return result


def run(source_file, dest_file, dest_pdf_file, only_pdf):
    """Convert a Markdown file to HTML and, optionally, to PDF.

    Args:
        source_file: path of the Markdown input; its extension must be one of
            .md, .markdown, .mdown or .mkd.
        dest_file: path of the HTML output (ignored when *only_pdf* is true).
        dest_pdf_file: path of the PDF output; an empty string skips the PDF
            step unless *only_pdf* is true.
        only_pdf: when true, the HTML is written to a temporary file that is
            removed after the PDF has been produced.

    Exits the process with status 1 when the input extension is not
    recognised. The PDF step shells out to the external ``wkhtmltopdf`` tool.
    """
    # os.path.splitext always returns the suffix with its leading dot.
    _, suffix = os.path.splitext(source_file)
    if suffix not in (".md", ".markdown", ".mdown", ".mkd"):
        print('Error: the file should be in markdown format')
        sys.exit(1)

    dest_name = ".~temp~.html" if only_pdf else dest_file

    # The output is written as UTF-8 because the page declares that charset.
    # The input keeps the platform default encoding, as before.
    with open(source_file, "r") as src, open(dest_name, "w", encoding="utf-8") as out:
        # Page scaffolding: a three-column layout with the content in the middle.
        out.write("""<style type="text/css">div {display: block;font-family: "Times New Roman",Georgia,Serif}            #wrapper { width: 100%;height:100%; margin: 0; padding: 0;}#left { float:left;             width: 10%;  height: 100%;  }#second {   float:left;   width: 80%;height: 100%;               }#right {float:left;  width: 10%;  height: 100%;             }</style><div id="wrapper"> <div id="left"></div><div id="second">""")
        out.write("""<meta charset="utf-8"/>""")

        # Parse the Markdown file line by line.
        for eachline in src:
            result = parse(eachline)
            if result != "":
                out.write(result)

        out.write("""</br></br></div><div id="right"></div></div>""")

        # Formula support, only when a $...$ span was seen while parsing.
        if need_mathjax:
            # Raw string: the JavaScript source must contain '\\(' so that the
            # delimiter is backslash-paren. A non-raw literal emitted '\(',
            # which JavaScript reads as a bare parenthesis.
            # NOTE(review): confirm this CDN URL still resolves.
            out.write(r"""<script type="text/x-mathjax-config">        MathJax.Hub.Config({tex2jax: {inlineMath: [['$','$'], ['\\(','\\)']]}});        </script><script type="text/javascript"         src="http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>""")

    # Convert the HTML to PDF with the external wkhtmltopdf tool.
    if dest_pdf_file != "" or only_pdf:
        call(["wkhtmltopdf", dest_name, dest_pdf_file])
    # Remove the intermediate HTML; os.remove also works where "rm" is absent.
    if only_pdf:
        os.remove(dest_name)


# Entry point
def main():
    """Parse the command line with docopt and run the conversion.

    The HTML goes to <outputfile> when --output is given, otherwise to
    "translation_result.html". With --print or --Print the PDF goes to
    <outputfile>; --Print additionally discards the intermediate HTML.

    NOTE(review): docopt reads the usage text from the module docstring
    (``__doc__``); none is visible in this part of the file, so confirm that
    one exists.
    """
    args = docopt(__doc__, version=__version__)

    dest_file = args['<outputfile>'] if args['--output'] else "translation_result.html"
    dest_pdf_file = args['<outputfile>'] if args['--print'] or args['--Print'] else ""

    run(args['<sourcefile>'], dest_file, dest_pdf_file, args['--Print'])


if __name__ == "__main__":
    main()

使用Python3將Markdown(.md)文本轉換成 html、pdf