添加多个类别关键词，优化数据处理逻辑，支持从 arXiv 提取和筛选论文数据

This commit is contained in:
2025-07-30 23:05:31 +08:00
parent 7d15721f61
commit 40262648c4
6 changed files with 298 additions and 81 deletions

View File

@@ -50,5 +50,5 @@ def get_Composition_ratio(input_file):
if __name__ == "__main__":
# input_file = "sftdata.jsonl"
input_file = "output-26.jsonl"
input_file = "arxiv-metadata-oai-snapshot--swift-26.json"
input_file = "G:\\11\\data-prepare\\arxiv-metadata-oai-snapshot-multi-batch1.json"
get_Composition_ratio(input_file)