data-prepare/03-data_select_random.py

23 lines
731 B
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import random
# Draw a reproducible random sample of records from a JSON Lines file
# (one JSON object per line) and write the sample to a new JSON Lines file.
input_path = "arxiv-metadata-oai-snapshot-date-len.json"
output_path = "arxiv-metadata-oai-snapshot--random.json"
sample_size = 10000  # number of records to draw; change to any other value

# Load every record into memory first; the original author notes that
# roughly 300,000 records is an acceptable size for this.
# The encoding is explicit so decoding does not depend on the platform's
# locale, and blank lines are skipped because json.loads rejects them.
with open(input_path, 'r', encoding='utf-8') as infile:
    data = [json.loads(line) for line in infile if line.strip()]
print(f"原始数据量:{len(data)}")

# random.sample raises ValueError when asked for more items than the
# population holds, so cap the request at the number of records available.
if sample_size > len(data):
    print(f"样本量 {sample_size} 超过数据总量 {len(data)},将改为抽取全部数据")
    sample_size = len(data)

random.seed(42)  # fixed seed makes the sample reproducible; any value works

# Sample without replacement.
sampled_data = random.sample(data, sample_size)

# Write the sample back out, one JSON object per line.
with open(output_path, 'w', encoding='utf-8') as outfile:
    outfile.writelines(json.dumps(record) + '\n' for record in sampled_data)
print(f"已随机抽取 {sample_size} 条数据保存到 {output_path}")