
项目特性:
- 完整的Markdown编辑器,支持实时预览
- 文件管理功能,支持保存/加载/删除文件
- 正则表达式工具,支持批量文本替换
- 前后端分离架构
- 响应式设计

技术栈:
- 前端:React + TypeScript + Vite
- 后端:Python Flask
- Markdown解析:Python-Markdown

包含组件:
- WorkingMarkdownEditor: 基础功能版本
- FullMarkdownEditor: 完整功能版本
- SimpleMarkdownEditor: 简化版本
212 lines
6.2 KiB
Python
212 lines
6.2 KiB
Python
"""
|
||
Markdown解析器模块
|
||
提供Markdown文本到HTML的转换功能,支持数学公式、代码高亮等扩展
|
||
"""
|
||
|
||
import markdown
|
||
from markdown.extensions import codehilite, fenced_code, tables, toc
|
||
from mdx_math import MathExtension
|
||
import re
|
||
from typing import Dict, Any, List
|
||
|
||
|
||
class MarkdownParser:
|
||
"""
|
||
Markdown解析器类
|
||
|
||
负责将Markdown文本转换为HTML格式,支持多种扩展功能
|
||
"""
|
||
|
||
def __init__(self):
    """Create the ``markdown.Markdown`` converter used by this parser.

    The converter is stored on ``self.md`` and is configured with the
    codehilite, fenced_code, tables and toc extensions plus a
    ``MathExtension`` instance that has the dollar delimiter enabled.
    """
    # Extensions in the order they are handed to Python-Markdown.
    enabled_extensions = [
        'codehilite',
        'fenced_code',
        'tables',
        'toc',
        MathExtension(enable_dollar_delimiter=True),
    ]

    # Options forwarded to the codehilite extension.
    codehilite_options = {
        'css_class': 'highlight',
        'use_pygments': True,
        'noclasses': False,
    }

    # Options forwarded to the toc extension.
    toc_options = {
        'permalink': True,
        'baselevel': 1,
    }

    self.md = markdown.Markdown(
        extensions=enabled_extensions,
        extension_configs={
            'codehilite': codehilite_options,
            'toc': toc_options,
        },
    )
|
||
|
||
def parse(self, content: str) -> Dict[str, Any]:
    """
    Convert Markdown text to HTML and collect summary information.

    A leading YAML front-matter block (``---`` ... ``---``) is metadata,
    not document text: it is removed before rendering and counting so it
    neither leaks into the HTML nor inflates the word count. Metadata is
    still extracted from the complete, unmodified input.

    Args:
        content (str): Markdown source text.

    Returns:
        Dict[str, Any]: A dictionary with these keys:
            'html' (str): HTML produced by ``self.md.convert``.
            'metadata' (dict): result of ``self._extract_metadata``.
            'toc' (str): value of ``self.md.toc`` after conversion,
                or '' when the converter has no such attribute.
            'word_count' (int): each CJK ideograph or kana counts as
                one word; every other run of word characters counts
                as one word.
            'reading_time' (int): minutes at 200 words per minute,
                never less than 1 for non-empty input.
        Empty (or ``None``) content yields '' / {} / 0 for every key.
    """
    if not content:
        return {
            'html': '',
            'metadata': {},
            'toc': '',
            'word_count': 0,
            'reading_time': 0
        }

    metadata = self._extract_metadata(content)

    # Drop the front-matter block so it is not rendered as a rule/heading.
    front_matter = re.match(
        r'^---\s*\n(.*?)\n---\s*(?:\n|$)', content, re.DOTALL
    )
    body = content[front_matter.end():] if front_matter else content

    # Clear any state left over from a previous conversion.
    self.md.reset()
    html = self.md.convert(body)
    toc_html = getattr(self.md, 'toc', '')

    # CJK text has no spaces between words, so r'\w+' alone would count a
    # whole sentence as a single word. Count those characters one by one
    # and count the remaining text as space-delimited words.
    cjk_char = r'[\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]'
    cjk_count = len(re.findall(cjk_char, body))
    other_count = len(re.findall(r'\w+', re.sub(cjk_char, ' ', body)))
    word_count = cjk_count + other_count

    # Assumes a reading speed of 200 words per minute.
    reading_time = max(1, word_count // 200)

    return {
        'html': html,
        'metadata': metadata,
        'toc': toc_html,
        'word_count': word_count,
        'reading_time': reading_time
    }
|
||
|
||
def _extract_metadata(self, content: str) -> Dict[str, Any]:
|
||
"""
|
||
从Markdown内容中提取元数据
|
||
|
||
Args:
|
||
content (str): Markdown文本
|
||
|
||
Returns:
|
||
Dict[str, Any]: 提取的元数据
|
||
"""
|
||
metadata = {}
|
||
|
||
# 提取标题(第一个#开头的行)
|
||
title_match = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
|
||
if title_match:
|
||
metadata['title'] = title_match.group(1).strip()
|
||
|
||
# 提取副标题(第二个##开头的行)
|
||
subtitle_match = re.search(r'^##\s+(.+)$', content, re.MULTILINE)
|
||
if subtitle_match:
|
||
metadata['subtitle'] = subtitle_match.group(1).strip()
|
||
|
||
# 提取YAML前置元数据
|
||
yaml_match = re.match(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL)
|
||
if yaml_match:
|
||
yaml_content = yaml_match.group(1)
|
||
yaml_data = self._parse_yaml(yaml_content)
|
||
metadata.update(yaml_data)
|
||
|
||
# 提取标签(以#开头的单词)
|
||
tags = re.findall(r'#(\w+)', content)
|
||
if tags:
|
||
metadata['tags'] = tags
|
||
|
||
# 提取链接
|
||
links = re.findall(r'\[([^\]]+)\]\(([^)]+)\)', content)
|
||
if links:
|
||
metadata['links'] = [{'text': text, 'url': url} for text, url in links]
|
||
|
||
return metadata
|
||
|
||
def _parse_yaml(self, yaml_content: str) -> Dict[str, Any]:
|
||
"""
|
||
简易YAML解析器
|
||
|
||
Args:
|
||
yaml_content (str): YAML格式的文本
|
||
|
||
Returns:
|
||
Dict[str, Any]: 解析后的键值对
|
||
"""
|
||
result = {}
|
||
lines = yaml_content.strip().split('\n')
|
||
|
||
for line in lines:
|
||
line = line.strip()
|
||
if ':' in line:
|
||
key, value = line.split(':', 1)
|
||
key = key.strip()
|
||
value = value.strip().strip('"\'')
|
||
|
||
# 处理数组格式
|
||
if value.startswith('[') and value.endswith(']'):
|
||
value = [item.strip().strip('"\'') for item in value[1:-1].split(',')]
|
||
|
||
result[key] = value
|
||
|
||
return result
|
||
|
||
def get_toc(self, content: str) -> str:
    """
    Build the table of contents for a Markdown document.

    The content is run through ``self.md`` and the ``toc`` attribute
    that the conversion leaves on the converter is returned.

    Args:
        content (str): Markdown source text.

    Returns:
        str: Table-of-contents HTML; '' for empty (or ``None``) content
        or when the converter exposes no ``toc`` attribute.
    """
    # Same empty-input contract as parse(): nothing to convert, no TOC.
    if not content:
        return ''

    # Clear any state left over from a previous conversion.
    self.md.reset()
    self.md.convert(content)
    return getattr(self.md, 'toc', '')
|
||
|
||
def extract_code_blocks(self, content: str) -> List[Dict[str, str]]:
    """
    Extract fenced (triple-backtick) code blocks from Markdown text.

    A fence must start its line (leading spaces or tabs are allowed).
    The language is the first token after the opening fence and may
    contain any non-space characters (``c++``, ``objective-c``); the rest
    of the info string is ignored. Empty blocks are returned with an
    empty code string.

    Args:
        content (str): Markdown source text.

    Returns:
        List[Dict[str, str]]: One ``{'language': ..., 'code': ...}`` dict
        per block, in document order. 'language' is 'text' when the fence
        names none; 'code' has surrounding whitespace stripped.
    """
    # Anchoring both fences to line starts keeps an unmatched or inline
    # triple backtick from pairing with the fence of a later block.
    fence_pattern = re.compile(
        r'^[ \t]*```[ \t]*([^\s`]*)[^\n`]*\n(.*?)^[ \t]*```',
        re.DOTALL | re.MULTILINE,
    )

    return [
        {'language': lang or 'text', 'code': code.strip()}
        for lang, code in fence_pattern.findall(content)
    ]
|
||
|
||
def extract_images(self, content: str) -> List[Dict[str, str]]:
    """
    Extract inline images (``![alt](src)``) from Markdown text.

    Args:
        content (str): Markdown source text.

    Returns:
        List[Dict[str, str]]: One ``{'alt': ..., 'src': ...}`` dict per
        image, in document order. 'src' is the image target with
        surrounding whitespace and any trailing quoted title
        (``![alt](pic.png "Title")``) removed.
    """
    image_pattern = r'!\[([^\]]*)\]\(([^)]+)\)'
    # Optional title: whitespace followed by a quoted string at the end.
    title_suffix = r'\s+(?:"[^"]*"|\'[^\']*\')\s*$'

    return [
        {'alt': alt, 'src': re.sub(title_suffix, '', target.strip())}
        for alt, target in re.findall(image_pattern, content)
    ]