Files
mmkk/backend/utils/markdown_parser.py

212 lines
6.2 KiB
Python
Raw Normal View History

"""
Markdown parser module.

Provides Markdown-to-HTML conversion with support for extensions such as
math formulas and code highlighting.
"""
import markdown
from markdown.extensions import codehilite, fenced_code, tables, toc
from mdx_math import MathExtension
import re
from typing import Dict, Any, List
class MarkdownParser:
    """Convert Markdown text to HTML and extract lightweight metadata.

    Wraps a ``markdown.Markdown`` instance configured with the
    ``codehilite``, ``fenced_code``, ``tables`` and ``toc`` extensions plus
    ``MathExtension`` (dollar delimiters enabled), and adds regex-based
    helpers for front matter, headings, hashtags, links, images and fenced
    code blocks.
    """

    # Reading speed (words per minute) used for the reading-time estimate.
    WORDS_PER_MINUTE = 200

    # Leading ``---`` ... ``---`` block. The closing fence may be the very
    # last line of the document (no trailing newline required).
    _FRONT_MATTER_RE = re.compile(
        r'\A---[ \t]*\r?\n(.*?)\r?\n---[ \t]*(?:\r?\n|\Z)',
        re.DOTALL,
    )
    # Fenced code block: a backtick or tilde fence at the start of a line,
    # an optional info string (first token = language), and a closing fence
    # made of the same characters on its own line.
    _FENCE_RE = re.compile(
        r'^[ \t]*(?P<fence>`{3,}|~{3,})[ \t]*(?P<lang>[^\s`{]*)[^`\n]*\n'
        r'(?P<code>.*?)'
        r'^[ \t]*(?P=fence)[ \t]*\r?$',
        re.MULTILINE | re.DOTALL,
    )
    # ``[ \t]+`` (not ``\s+``) so an empty heading cannot swallow the next line.
    _TITLE_RE = re.compile(r'^#[ \t]+(.+)$', re.MULTILINE)
    _SUBTITLE_RE = re.compile(r'^##[ \t]+(.+)$', re.MULTILINE)
    # A hashtag must start a line or follow whitespace, which excludes URL
    # fragments (``page#frag``), entities (``&#123;``) and ``(#anchor)``.
    _TAG_RE = re.compile(r'(?<!\S)#(\w+)')
    # ``(?<!!)`` keeps image syntax ``![alt](src)`` out of the link list.
    _LINK_RE = re.compile(r'(?<!!)\[([^\]]+)\]\(([^)]+)\)')
    _IMAGE_RE = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
    # ``url "title"`` / ``url 'title'`` inside a link or image target.
    _LINK_TITLE_RE = re.compile(r'^\s*(\S+)\s+(?:"[^"]*"|\'[^\']*\')\s*$')
    # Han ideographs and kana: scripts written without spaces between words.
    _CJK_RE = re.compile(
        r'[\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]'
    )

    def __init__(self):
        """Create the underlying Markdown converter with all extensions."""
        self.md = markdown.Markdown(
            extensions=[
                'codehilite',
                'fenced_code',
                'tables',
                'toc',
                MathExtension(enable_dollar_delimiter=True),
            ],
            extension_configs={
                'codehilite': {
                    'css_class': 'highlight',
                    'use_pygments': True,
                    'noclasses': False,
                },
                'toc': {
                    'permalink': True,
                    'baselevel': 1,
                },
            },
        )

    def parse(self, content: str) -> Dict[str, Any]:
        """Convert Markdown content to HTML and gather document statistics.

        A leading YAML front-matter block is parsed into the metadata and
        removed before conversion, so it is neither rendered into the HTML
        nor counted as words.

        Args:
            content: Markdown source text.

        Returns:
            A dict with the keys:
                'html': converted HTML (str)
                'metadata': extracted metadata (dict)
                'toc': table-of-contents HTML (str)
                'word_count': number of words (int)
                'reading_time': estimated reading time in minutes (int)
            Empty or ``None`` content yields empty values and zero counts.
        """
        if not content:
            return {
                'html': '',
                'metadata': {},
                'toc': '',
                'word_count': 0,
                'reading_time': 0,
            }

        front_matter, body = self._split_front_matter(content)

        # The converter instance is reused, so clear state from earlier runs.
        self.md.reset()
        metadata = self._collect_metadata(front_matter, body)
        html = self.md.convert(body)
        toc_html = getattr(self.md, 'toc', '')

        word_count = self._count_words(body)
        # Floor division, but never report less than one minute.
        reading_time = max(1, word_count // self.WORDS_PER_MINUTE)

        return {
            'html': html,
            'metadata': metadata,
            'toc': toc_html,
            'word_count': word_count,
            'reading_time': reading_time,
        }

    def _extract_metadata(self, content: str) -> Dict[str, Any]:
        """Extract metadata from Markdown content.

        Args:
            content: Markdown source text, optionally starting with a YAML
                front-matter block.

        Returns:
            A dict that may contain 'title' (first level-1 heading),
            'subtitle' (first level-2 heading), every front-matter key
            (these override the heading-derived values), 'tags' and 'links'
            (list of ``{'text', 'url'}`` dicts).
        """
        front_matter, body = self._split_front_matter(content)
        return self._collect_metadata(front_matter, body)

    def _split_front_matter(self, content: str) -> tuple:
        """Split content into ``(front_matter_dict, body)``.

        The leading ``---`` block is only treated as front matter when it
        yields at least one ``key: value`` pair; otherwise the content is
        returned untouched so a document that merely opens with a horizontal
        rule is not truncated.
        """
        match = self._FRONT_MATTER_RE.match(content)
        if match is None:
            return {}, content
        data = self._parse_yaml(match.group(1))
        if not data:
            return {}, content
        return data, content[match.end():]

    def _collect_metadata(self, front_matter: Dict[str, Any],
                          body: str) -> Dict[str, Any]:
        """Build the metadata dict from parsed front matter and the body."""
        # Drop fenced code first: '# comment' lines and '#include' inside
        # code are neither headings nor hashtags.
        scan_text = self._FENCE_RE.sub('', body)
        metadata: Dict[str, Any] = {}

        title_match = self._TITLE_RE.search(scan_text)
        if title_match:
            metadata['title'] = title_match.group(1).strip()

        subtitle_match = self._SUBTITLE_RE.search(scan_text)
        if subtitle_match:
            metadata['subtitle'] = subtitle_match.group(1).strip()

        # Explicit front-matter values win over heading-derived ones.
        metadata.update(front_matter)

        inline_tags = self._TAG_RE.findall(scan_text)
        if inline_tags:
            # Merge with tags declared in front matter instead of
            # overwriting them; dict.fromkeys de-duplicates in order.
            declared = metadata.get('tags')
            if isinstance(declared, list):
                merged = declared + inline_tags
            elif declared:
                merged = [declared] + inline_tags
            else:
                merged = inline_tags
            metadata['tags'] = list(dict.fromkeys(merged))

        links = self._LINK_RE.findall(scan_text)
        if links:
            metadata['links'] = [
                {'text': text, 'url': self._strip_link_title(target)}
                for text, target in links
            ]
        return metadata

    def _strip_link_title(self, target: str) -> str:
        """Return the URL part of a link target, dropping a quoted title.

        Targets without a quoted title are returned unchanged.
        """
        match = self._LINK_TITLE_RE.match(target)
        return match.group(1) if match else target

    def _count_words(self, text: str) -> int:
        """Count words, treating each CJK character as one word.

        ``\\w+`` alone would count an entire run of Chinese or Japanese
        characters as a single word, since those scripts use no spaces.
        """
        cjk_chars = len(self._CJK_RE.findall(text))
        remainder = self._CJK_RE.sub(' ', text)
        return cjk_chars + len(re.findall(r'\w+', remainder))

    def _parse_yaml(self, yaml_content: str) -> Dict[str, Any]:
        """Parse a minimal subset of YAML: flat ``key: value`` lines.

        Supports quoted scalars and inline lists (``[a, b]``). Blank lines,
        ``#`` comment lines, lines without a colon and lines with an empty
        key are ignored. Nested structures and block lists are not supported.

        Args:
            yaml_content: Text between the front-matter fences.

        Returns:
            Mapping of keys to ``str`` values or ``list`` of ``str``.
        """
        result: Dict[str, Any] = {}
        for raw_line in yaml_content.strip().split('\n'):
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            key, separator, value = line.partition(':')
            key = key.strip()
            if not separator or not key:
                continue
            value = value.strip().strip('"\'')
            if value.startswith('[') and value.endswith(']'):
                items = (
                    item.strip().strip('"\'')
                    for item in value[1:-1].split(',')
                )
                # Filtering empties makes '[]' an empty list, not [''].
                value = [item for item in items if item]
            result[key] = value
        return result

    def get_toc(self, content: str) -> str:
        """Generate the table of contents for the given Markdown.

        Args:
            content: Markdown source text.

        Returns:
            Table-of-contents HTML, or '' for empty content.
        """
        if not content:
            return ''
        _, body = self._split_front_matter(content)
        self.md.reset()
        self.md.convert(body)
        return getattr(self.md, 'toc', '')

    def extract_code_blocks(self, content: str) -> List[Dict[str, str]]:
        """Extract fenced code blocks.

        Args:
            content: Markdown source text.

        Returns:
            One ``{'language', 'code'}`` dict per fenced block, in document
            order. 'language' is the first token of the info string (for
            example ``c++``) or 'text' when none is given; 'code' has
            surrounding whitespace stripped.
        """
        return [
            {
                'language': match.group('lang') or 'text',
                'code': match.group('code').strip(),
            }
            for match in self._FENCE_RE.finditer(content)
        ]

    def extract_images(self, content: str) -> List[Dict[str, str]]:
        """Extract inline images.

        Args:
            content: Markdown source text.

        Returns:
            One ``{'alt', 'src'}`` dict per ``![alt](src)`` occurrence; an
            optional quoted title after the URL is not included in 'src'.
        """
        return [
            {'alt': alt, 'src': self._strip_link_title(target)}
            for alt, target in self._IMAGE_RE.findall(content)
        ]