Files
mmkk/backend/utils/markdown_parser.py
guo liwei 9b3f959c3d Initial commit: Markdown editor with file management and regex tools
项目特性:
- 完整的Markdown编辑器,支持实时预览
- 文件管理功能,支持保存/加载/删除文件
- 正则表达式工具,支持批量文本替换
- 前后端分离架构
- 响应式设计

技术栈:
- 前端:React + TypeScript + Vite
- 后端:Python Flask
- Markdown解析:Python-Markdown

包含组件:
- WorkingMarkdownEditor: 基础功能版本
- FullMarkdownEditor: 完整功能版本
- SimpleMarkdownEditor: 简化版本
2025-08-03 06:21:02 +08:00

212 lines
6.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Markdown解析器模块
提供Markdown文本到HTML的转换功能,支持数学公式、代码高亮等扩展
"""
import markdown
from markdown.extensions import codehilite, fenced_code, tables, toc
from mdx_math import MathExtension
import re
from typing import Dict, Any, List
class MarkdownParser:
    """Convert Markdown text to HTML and extract document metadata.

    The parser wraps one ``markdown.Markdown`` instance configured with the
    ``codehilite``, ``fenced_code``, ``tables`` and ``toc`` extensions plus
    ``MathExtension`` (dollar delimiters enabled).  The remaining helpers are
    regex-based and work directly on the raw Markdown text.
    """

    # Divisor used to turn a word count into minutes of reading time.
    _WORDS_PER_MINUTE = 200

    # Leading ``---`` ... ``---`` block; the closing fence may end the document.
    _FRONT_MATTER_RE = re.compile(
        r'\A---[ \t\r]*\n(.*?)\n---[ \t\r]*(?:\n|\Z)', re.DOTALL
    )

    # Fenced code block (backticks or tildes).  The closing fence must sit on
    # its own line and repeat the opening fence exactly.
    _FENCED_CODE_RE = re.compile(
        r'^[ \t]*(?P<fence>`{3,}|~{3,})(?P<info>[^\n`]*)\n'
        r'(?P<code>.*?)'
        r'^[ \t]*(?P=fence)[ \t\r]*$',
        re.DOTALL | re.MULTILINE,
    )

    # ``[ \t]+`` rather than ``\s+`` so an empty heading cannot swallow the
    # following line as its text.
    _TITLE_RE = re.compile(r'^#[ \t]+(.+)$', re.MULTILINE)
    _SUBTITLE_RE = re.compile(r'^##[ \t]+(.+)$', re.MULTILINE)

    # A hashtag must not be glued to preceding text, which excludes URL
    # fragments (``page#section``) and HTML entities (``&#39;``).
    _TAG_RE = re.compile(r'(?<!\S)#(\w+)')

    # ``(?<!!)`` keeps images (``![alt](src)``) out of the link list.
    _LINK_RE = re.compile(r'(?<!!)\[([^\]]+)\]\(([^)]+)\)')
    _IMAGE_RE = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')

    # ``destination "title"`` / ``destination 'title'`` inside ``( ... )``.
    _LINK_TITLE_RE = re.compile(r'''^(\S+)\s+(?:"[^"]*"|'[^']*')$''')

    # Each CJK ideograph counts as one word; any other run of word characters
    # counts as one word.
    _WORD_RE = re.compile(
        r'[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]'
        r'|[^\W\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]+'
    )

    def __init__(self):
        """Create the underlying ``markdown.Markdown`` instance with all extensions."""
        self.md = markdown.Markdown(
            extensions=[
                'codehilite',
                'fenced_code',
                'tables',
                'toc',
                MathExtension(enable_dollar_delimiter=True),
            ],
            extension_configs={
                'codehilite': {
                    'css_class': 'highlight',
                    'use_pygments': True,
                    'noclasses': False,
                },
                'toc': {
                    'permalink': True,
                    'baselevel': 1,
                },
            },
        )

    def parse(self, content: str) -> Dict[str, Any]:
        """Render Markdown to HTML and collect metadata and statistics.

        The full ``content`` (including any front matter block) is passed to
        the renderer unchanged.

        Args:
            content (str): Markdown source text.

        Returns:
            Dict[str, Any]: A dictionary with the keys
                ``html`` (str): rendered HTML,
                ``metadata`` (dict): result of ``_extract_metadata``,
                ``toc`` (str): table-of-contents HTML,
                ``word_count`` (int): number of words (CJK ideographs are
                counted individually),
                ``reading_time`` (int): estimated minutes, at least 1 for
                non-empty input.
            Empty input yields empty strings, an empty dict and zeros.
        """
        if not content:
            return {
                'html': '',
                'metadata': {},
                'toc': '',
                'word_count': 0,
                'reading_time': 0,
            }

        # The Markdown instance is reused, so clear state left by earlier calls.
        self.md.reset()

        metadata = self._extract_metadata(content)
        html = self.md.convert(content)
        toc_html = getattr(self.md, 'toc', '')

        word_count = self._count_words(content)
        reading_time = max(1, word_count // self._WORDS_PER_MINUTE)

        return {
            'html': html,
            'metadata': metadata,
            'toc': toc_html,
            'word_count': word_count,
            'reading_time': reading_time,
        }

    def _count_words(self, content: str) -> int:
        """Count words in ``content``, treating each CJK ideograph as one word."""
        if not content:
            return 0
        return len(self._WORD_RE.findall(content))

    def _strip_fenced_code(self, content: str) -> str:
        """Return ``content`` with every closed fenced code block removed."""
        return self._FENCED_CODE_RE.sub('', content)

    def _link_destination(self, target: str) -> str:
        """Return the destination of a ``(...)`` link target without its quoted title."""
        target = target.strip()
        titled = self._LINK_TITLE_RE.match(target)
        return titled.group(1) if titled else target

    def _extract_metadata(self, content: str) -> Dict[str, Any]:
        """Extract title, subtitle, front matter, tags and links from Markdown.

        Headings, hashtags and links are searched in the text with fenced code
        blocks removed, so code comments such as ``# install`` are not
        mistaken for a title or a tag.

        Args:
            content (str): Markdown source text.

        Returns:
            Dict[str, Any]: Any of the following keys that were found:
                ``title``: text of the first ``# `` heading,
                ``subtitle``: text of the first ``## `` heading,
                every key of the leading ``---`` front matter block (these
                override ``title`` / ``subtitle``),
                ``tags``: front matter tags followed by inline ``#tag`` words,
                without duplicates,
                ``links``: list of ``{'text', 'url'}`` dicts (images excluded).
        """
        metadata: Dict[str, Any] = {}
        if not content:
            return metadata

        prose = self._strip_fenced_code(content)

        title_match = self._TITLE_RE.search(prose)
        if title_match:
            metadata['title'] = title_match.group(1).strip()

        subtitle_match = self._SUBTITLE_RE.search(prose)
        if subtitle_match:
            metadata['subtitle'] = subtitle_match.group(1).strip()

        # Front matter is only recognised at the very start of the document.
        front_matter = self._FRONT_MATTER_RE.match(content)
        if front_matter:
            metadata.update(self._parse_yaml(front_matter.group(1)))

        inline_tags = self._TAG_RE.findall(prose)
        if inline_tags:
            # Merge with tags declared in the front matter instead of
            # overwriting them; dict.fromkeys de-duplicates and keeps order.
            declared = metadata.get('tags')
            if isinstance(declared, str):
                declared = [declared] if declared else []
            elif not isinstance(declared, list):
                declared = []
            metadata['tags'] = list(dict.fromkeys(declared + inline_tags))

        links = self._LINK_RE.findall(prose)
        if links:
            metadata['links'] = [
                {'text': text, 'url': self._link_destination(target)}
                for text, target in links
            ]

        return metadata

    def _parse_yaml(self, yaml_content: str) -> Dict[str, Any]:
        """Parse a flat ``key: value`` subset of YAML.

        Only single-line ``key: value`` pairs and inline ``[a, b]`` lists are
        understood.  Blank lines, ``#`` comment lines and lines without a
        colon or without a key are ignored.

        Args:
            yaml_content (str): Text between the front matter fences.

        Returns:
            Dict[str, Any]: Keys mapped to a string, or to a list of strings
            for inline lists (``[]`` yields an empty list).
        """
        result: Dict[str, Any] = {}
        for raw_line in yaml_content.strip().split('\n'):
            line = raw_line.strip()
            if not line or line.startswith('#') or ':' not in line:
                continue

            # Split on the first colon only so values such as URLs stay intact.
            key, _, value = line.partition(':')
            key = key.strip()
            if not key:
                continue

            value = value.strip().strip('"\'')
            if value.startswith('[') and value.endswith(']'):
                items = (
                    item.strip().strip('"\'')
                    for item in value[1:-1].split(',')
                )
                # Dropping empty items makes ``[]`` parse as an empty list.
                value = [item for item in items if item]
            result[key] = value
        return result

    def get_toc(self, content: str) -> str:
        """Render ``content`` and return only the table-of-contents HTML.

        Args:
            content (str): Markdown source text.

        Returns:
            str: Table-of-contents HTML, or ``''`` for empty input.
        """
        if not content:
            return ''
        self.md.reset()
        self.md.convert(content)
        return getattr(self.md, 'toc', '')

    def extract_code_blocks(self, content: str) -> List[Dict[str, str]]:
        """List the fenced code blocks found in ``content``.

        Both backtick and tilde fences are recognised.  Empty blocks and info
        strings containing non-word characters (``c++``) are handled.

        Args:
            content (str): Markdown source text.

        Returns:
            List[Dict[str, str]]: One ``{'language', 'code'}`` dict per block.
            ``language`` is the first token of the info string, or ``'text'``
            when the fence has none; ``code`` is the block body with
            surrounding whitespace stripped.
        """
        if not content:
            return []

        blocks = []
        for match in self._FENCED_CODE_RE.finditer(content):
            info_tokens = match.group('info').split()
            blocks.append({
                'language': info_tokens[0] if info_tokens else 'text',
                'code': match.group('code').strip(),
            })
        return blocks

    def extract_images(self, content: str) -> List[Dict[str, str]]:
        """List the inline images (``![alt](src)``) found in ``content``.

        Args:
            content (str): Markdown source text.

        Returns:
            List[Dict[str, str]]: One ``{'alt', 'src'}`` dict per image, where
            ``src`` excludes any quoted title that follows the destination.
        """
        if not content:
            return []
        return [
            {'alt': alt, 'src': self._link_destination(target)}
            for alt, target in self._IMAGE_RE.findall(content)
        ]