212 lines
6.2 KiB
Python
212 lines
6.2 KiB
Python
![]() |
"""
Markdown parser module.

Converts Markdown text to HTML, with support for extensions such as math
formulas and code highlighting.
"""
|
|||
|
|
|||
|
import markdown
|
|||
|
from markdown.extensions import codehilite, fenced_code, tables, toc
|
|||
|
from mdx_math import MathExtension
|
|||
|
import re
|
|||
|
from typing import Dict, Any, List
|
|||
|
|
|||
|
|
|||
|
class MarkdownParser:
    """
    Markdown parser.

    Converts Markdown text to HTML through one preconfigured
    ``markdown.Markdown`` instance, and offers regex-based helpers that pull
    metadata, fenced code blocks and images out of the raw Markdown text.
    """

    # Leading YAML front matter: an opening ``---`` line at the very start of
    # the text, the YAML body (group 1), and a closing ``---`` line. The
    # closing line may be the last line of the text (no trailing newline).
    _FRONT_MATTER_RE = re.compile(
        r'\A---[ \t]*\r?\n(.*?)^---[ \t]*\r?$\n?',
        re.MULTILINE | re.DOTALL,
    )

    # Triple-backtick fenced code block. Group 1 is the info string after the
    # opening fence, group 2 is the code. Both fences must sit on their own
    # line (optionally indented), so an empty block cannot be mis-paired with
    # the opening fence of a later block.
    _FENCE_RE = re.compile(
        r'^[ \t]*```([^\n`]*)\n(.*?)^[ \t]*```[ \t]*\r?$',
        re.MULTILINE | re.DOTALL,
    )

    # ATX headings. ``[ \t]+`` (not ``\s+``) keeps the match on one line.
    _TITLE_RE = re.compile(r'^#[ \t]+(.+)$', re.MULTILINE)
    _SUBTITLE_RE = re.compile(r'^##[ \t]+(.+)$', re.MULTILINE)

    # Optional closing hashes of an ATX heading, e.g. ``# Title ##``.
    _CLOSING_HASHES_RE = re.compile(r'[ \t]+#+[ \t]*$')

    # Inline hashtag. The look-behinds reject URL fragments (``page#frag``,
    # ``/#frag``), heading markers (``##``), HTML entities (``&#123;``) and
    # in-page link targets (``](#anchor)``).
    _TAG_RE = re.compile(r'(?<![\w#&/])(?<!\]\()#(\w+)')

    # Parenthesised link/image destination; tolerates one level of balanced
    # parentheses inside the URL, e.g. ``.../Foo_(bar)``.
    _DESTINATION = r'\(((?:[^()]|\([^()]*\))+)\)'

    # ``[text](dest)`` that is not preceded by ``!`` (i.e. not an image).
    _LINK_RE = re.compile(r'(?<!!)\[([^\]]+)\]' + _DESTINATION)

    # ``![alt](dest)``; the alt text may be empty.
    _IMAGE_RE = re.compile(r'!\[([^\]]*)\]' + _DESTINATION)

    # A destination followed by a quoted title: ``url "title"``.
    _TITLE_SUFFIX_RE = re.compile(
        r'^(.*?)[ \t]+(?:"[^"]*"|' + r"'[^']*')$",
        re.DOTALL,
    )

    # Kana and CJK ideograph ranges. Such text has no spaces between words,
    # so each character is counted on its own.
    _CJK = r'\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff'
    _WORD_RE = re.compile('[' + _CJK + ']|[^\\W' + _CJK + ']+')

    def __init__(self):
        """Create the shared Markdown converter with all extensions configured."""
        self.md = markdown.Markdown(
            extensions=[
                'codehilite',
                'fenced_code',
                'tables',
                'toc',
                # NOTE(review): presumably enables ``$...$`` math delimiters;
                # confirm against the mdx_math documentation.
                MathExtension(enable_dollar_delimiter=True),
            ],
            extension_configs={
                'codehilite': {
                    'css_class': 'highlight',
                    'use_pygments': True,
                    'noclasses': False,
                },
                'toc': {
                    'permalink': True,
                    'baselevel': 1,
                },
            },
        )

    def parse(self, content: str) -> Dict[str, Any]:
        """
        Convert Markdown content to HTML and collect statistics.

        Leading YAML front matter is parsed into the metadata but is removed
        before rendering and counting, so it does not show up in the HTML.

        Args:
            content (str): Markdown text.

        Returns:
            Dict[str, Any]: Dictionary with the keys
                'html' (str): converted HTML,
                'metadata' (dict): extracted metadata,
                'toc' (str): table-of-contents HTML,
                'word_count' (int): words plus individual CJK characters,
                'reading_time' (int): estimated minutes at 200 words per
                    minute, at least 1 for non-empty content.
            Empty content yields empty strings, an empty dict and zeros.
        """
        if not content:
            return {
                'html': '',
                'metadata': {},
                'toc': '',
                'word_count': 0,
                'reading_time': 0,
            }

        # The converter instance is reused across calls, so clear its state.
        self.md.reset()

        metadata = self._extract_metadata(content)
        body = self._strip_front_matter(content)

        html = self.md.convert(body)
        toc_html = getattr(self.md, 'toc', '')

        word_count = self._count_words(body)
        reading_time = max(1, word_count // 200)

        return {
            'html': html,
            'metadata': metadata,
            'toc': toc_html,
            'word_count': word_count,
            'reading_time': reading_time,
        }

    def _strip_front_matter(self, content: str) -> str:
        """Return content without its leading YAML front matter, if any."""
        front = self._FRONT_MATTER_RE.match(content)
        return content[front.end():] if front else content

    def _count_words(self, text: str) -> int:
        """Count words, treating every kana/CJK ideograph as one word."""
        return len(self._WORD_RE.findall(text))

    def _clean_heading(self, text: str) -> str:
        """Trim whitespace and optional closing hashes from heading text."""
        return self._CLOSING_HASHES_RE.sub('', text.strip())

    def _clean_destination(self, destination: str) -> str:
        """Reduce a link/image destination to its URL.

        Drops a trailing quoted title and a surrounding ``<...>`` wrapper.
        """
        target = destination.strip()
        titled = self._TITLE_SUFFIX_RE.match(target)
        if titled:
            target = titled.group(1).strip()
        if target.startswith('<') and target.endswith('>'):
            target = target[1:-1]
        return target

    def _extract_metadata(self, content: str) -> Dict[str, Any]:
        """
        Extract metadata from Markdown content.

        Headings, hashtags and links are looked up outside of the front
        matter and outside of fenced code blocks, so that ``# comment`` lines
        in code or YAML are not mistaken for headings or tags.

        Args:
            content (str): Markdown text.

        Returns:
            Dict[str, Any]: May contain
                'title': text of the first ``# `` heading,
                'subtitle': text of the first ``## `` heading,
                every key/value found in the YAML front matter (these
                    override 'title'/'subtitle' from headings),
                'tags': list of inline ``#hashtag`` names (when any exist
                    they replace a 'tags' value from the front matter),
                'links': list of {'text', 'url'} dicts for non-image links.
        """
        metadata: Dict[str, Any] = {}
        if not content:
            return metadata

        front = self._FRONT_MATTER_RE.match(content)
        body = content[front.end():] if front else content
        prose = self._FENCE_RE.sub('', body)

        title_match = self._TITLE_RE.search(prose)
        if title_match:
            metadata['title'] = self._clean_heading(title_match.group(1))

        subtitle_match = self._SUBTITLE_RE.search(prose)
        if subtitle_match:
            metadata['subtitle'] = self._clean_heading(subtitle_match.group(1))

        # Front matter is applied after the headings so explicit values win.
        if front:
            metadata.update(self._parse_yaml(front.group(1)))

        tags = self._TAG_RE.findall(prose)
        if tags:
            metadata['tags'] = tags

        links = self._LINK_RE.findall(prose)
        if links:
            metadata['links'] = [
                {'text': text, 'url': self._clean_destination(dest)}
                for text, dest in links
            ]

        return metadata

    def _parse_yaml(self, yaml_content: str) -> Dict[str, Any]:
        """
        Parse a minimal subset of YAML.

        Only flat ``key: value`` lines and inline ``[a, b]`` lists are
        understood. Blank lines, ``#`` comment lines, lines without a colon
        and lines with an empty key are skipped.

        Args:
            yaml_content (str): YAML-formatted text.

        Returns:
            Dict[str, Any]: Mapping of keys to a string or a list of strings.
        """
        result: Dict[str, Any] = {}

        for raw_line in yaml_content.splitlines():
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue

            key, separator, value = line.partition(':')
            key = key.strip()
            if not separator or not key:
                continue

            value = value.strip()
            # Test for a list before removing quotes, so that a quoted
            # scalar such as "[draft]" stays a string.
            if value.startswith('[') and value.endswith(']'):
                items = (
                    item.strip().strip('"\'')
                    for item in value[1:-1].split(',')
                )
                # Dropping empty items makes ``[]`` an empty list.
                result[key] = [item for item in items if item]
            else:
                result[key] = value.strip('"\'')

        return result

    def get_toc(self, content: str) -> str:
        """
        Build the table of contents.

        Args:
            content (str): Markdown text.

        Returns:
            str: Table-of-contents HTML; an empty string for empty content.
        """
        if not content:
            return ''

        self.md.reset()
        self.md.convert(self._strip_front_matter(content))
        return getattr(self.md, 'toc', '')

    def extract_code_blocks(self, content: str) -> List[Dict[str, str]]:
        """
        Extract triple-backtick fenced code blocks.

        Args:
            content (str): Markdown text.

        Returns:
            List[Dict[str, str]]: One dict per block with
                'language': first word of the fence info string, or 'text'
                    when the fence has no info string,
                'code': block contents with surrounding whitespace stripped.
        """
        if not content:
            return []

        blocks = []
        for info, code in self._FENCE_RE.findall(content):
            info_words = info.split()
            blocks.append({
                'language': info_words[0] if info_words else 'text',
                'code': code.strip(),
            })
        return blocks

    def extract_images(self, content: str) -> List[Dict[str, str]]:
        """
        Extract inline images.

        Args:
            content (str): Markdown text.

        Returns:
            List[Dict[str, str]]: One dict per image with 'alt' (possibly
                empty) and 'src' (the URL, without any quoted title).
        """
        if not content:
            return []

        return [
            {'alt': alt, 'src': self._clean_destination(dest)}
            for alt, dest in self._IMAGE_RE.findall(content)
        ]
|