更新数据转换功能,支持从新格式提取信息并生成多种问题模板,优化输入输出文件路径
This commit is contained in:
		@@ -23,10 +23,11 @@ def convert_to_alpaca_format(input_file, output_file):
 | 
				
			|||||||
        ]
 | 
					        ]
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					    choice_text=", A. quant-ph\nB. physics.chem-ph\nC. physics.atom-ph\nD. cond-mat.soft\nE. cs.RO\nF. cs.CL\nG. cs.SE\nH. cs.IR\nI. hep-th\nJ. hep-ph\nK. physics.optics\nL. cs.AI\nM. cs.CV\nN. nucl-th\nO. astro-ph\nP. math.PR\nQ. cs.OS\nR. eess.SP\nS. math.OC\nT. math.DS\nU. math.DG\nV. math.MP\nW. cs.MM\nX. stat.ME\nY. math.CO\nZ. cs.NE"
 | 
				
			||||||
    print(f"转换数据: {input_file} -> {output_file}")
 | 
					    print(f"转换数据: {input_file} -> {output_file}")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    converted_data = []
 | 
					    converted_data = []
 | 
				
			||||||
    with open(input_file, "r", encoding="utf-8") as f:
 | 
					    with open(input_file, "r", encoding="utf-8-sig") as f:
 | 
				
			||||||
        csv_reader = csv.DictReader(f)
 | 
					        csv_reader = csv.DictReader(f)
 | 
				
			||||||
        for row in csv_reader:
 | 
					        for row in csv_reader:
 | 
				
			||||||
            try:
 | 
					            try:
 | 
				
			||||||
@@ -44,7 +45,7 @@ def convert_to_alpaca_format(input_file, output_file):
 | 
				
			|||||||
                    "system": "你是个优秀的论文分类师",
 | 
					                    "system": "你是个优秀的论文分类师",
 | 
				
			||||||
                    "conversation": [
 | 
					                    "conversation": [
 | 
				
			||||||
                        {
 | 
					                        {
 | 
				
			||||||
                            "human": row["question"],
 | 
					                            "human": row["question"]+choice_text,
 | 
				
			||||||
                            "assistant": row["answer"]
 | 
					                            "assistant": row["answer"]
 | 
				
			||||||
                        }
 | 
					                        }
 | 
				
			||||||
                    ]
 | 
					                    ]
 | 
				
			||||||
@@ -62,19 +63,8 @@ def convert_to_alpaca_format(input_file, output_file):
 | 
				
			|||||||
    print(f"转换完成! 共转换 {len(converted_data)} 条数据")
 | 
					    print(f"转换完成! 共转换 {len(converted_data)} 条数据")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if __name__ == "__main__":
 | 
					if __name__ == "__main__":
 | 
				
			||||||
    # parser = argparse.ArgumentParser(description="转换数据到Alpaca格式")
 | 
					 | 
				
			||||||
    # parser.add_argument(
 | 
					 | 
				
			||||||
    #     "--input",
 | 
					 | 
				
			||||||
    #     type=str,
 | 
					 | 
				
			||||||
    #     required=True,
 | 
					 | 
				
			||||||
    #     help="输入文件路径 (swift_formatted_sft_train_data.jsonl)",
 | 
					 | 
				
			||||||
    # )
 | 
					 | 
				
			||||||
    # parser.add_argument("--output", type=str, required=True, help="输出文件路径")
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # args = parser.parse_args()
 | 
					    input_file = "G:\\11\\data-prepare\\eval_oc_data-26gai.csv"
 | 
				
			||||||
 | 
					    output_file = "G:\\11\\data-prepare\\newformat_sft_test_data--swift-sft-26.jsonl"  # 输出文件路径
 | 
				
			||||||
    #input_file = "arxiv-metadata-oai-snapshot--random.json"   # 20000条原始数据文件路径
 | 
					 | 
				
			||||||
    input_file = "newformat_sft_test_data.csv"
 | 
					 | 
				
			||||||
    output_file = "newformat_sft_test_data--swift-sft.jsonl"  # 输出文件路径
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    convert_to_alpaca_format(input_file, output_file)
 | 
					    convert_to_alpaca_format(input_file, output_file)
 | 
				
			||||||
@@ -2,6 +2,7 @@
 | 
				
			|||||||
import json
 | 
					import json
 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
import argparse
 | 
					import argparse
 | 
				
			||||||
 | 
					import random
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -99,7 +100,7 @@ def convert_to_alpaca_format(input_file, output_file):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def convert_onedata2multi_type(input_file, output_file):
 | 
					def convert_onedata2multi_type(input_file, output_file, num_templates):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    读取input_file,将Swift格式的1条数据按20种问题模板格式转换为20条数据,
 | 
					    读取input_file,将Swift格式的1条数据按20种问题模板格式转换为20条数据,
 | 
				
			||||||
    并保存为output_file
 | 
					    并保存为output_file
 | 
				
			||||||
@@ -108,6 +109,7 @@ def convert_onedata2multi_type(input_file, output_file):
 | 
				
			|||||||
    input_file: 输入文件路径
 | 
					    input_file: 输入文件路径
 | 
				
			||||||
    output_file: 输出文件路径
 | 
					    output_file: 输出文件路径
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					    print(f"开始转换数据...每条数据生成{num_templates}条变体")
 | 
				
			||||||
    print(f"开始转换数据: {input_file} -> {output_file}")
 | 
					    print(f"开始转换数据: {input_file} -> {output_file}")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    category_text=" A. quant-ph\nB. physics.chem-ph\nC. physics.atom-ph\nD. cond-mat.soft\nE. cs.RO\nF. cs.CL\nG. cs.SE\nH. cs.IR\nI. hep-th\nJ. hep-ph\nK. physics.optics\nL. cs.AI\nM. cs.CV\nN. nucl-th\nO. astro-ph\nP. math.PR\nQ. cs.OS\nR. eess.SP\nS. math.OC\nT. math.DS\nU. math.DG\nV. math.MP\nW. cs.MM\nX. stat.ME\nY. math.CO\nZ. cs.NE\n"
 | 
					    category_text=" A. quant-ph\nB. physics.chem-ph\nC. physics.atom-ph\nD. cond-mat.soft\nE. cs.RO\nF. cs.CL\nG. cs.SE\nH. cs.IR\nI. hep-th\nJ. hep-ph\nK. physics.optics\nL. cs.AI\nM. cs.CV\nN. nucl-th\nO. astro-ph\nP. math.PR\nQ. cs.OS\nR. eess.SP\nS. math.OC\nT. math.DS\nU. math.DG\nV. math.MP\nW. cs.MM\nX. stat.ME\nY. math.CO\nZ. cs.NE\n"
 | 
				
			||||||
@@ -183,45 +185,84 @@ def convert_onedata2multi_type(input_file, output_file):
 | 
				
			|||||||
            try:
 | 
					            try:
 | 
				
			||||||
                data = json.loads(line.strip())
 | 
					                data = json.loads(line.strip())
 | 
				
			||||||
                
 | 
					                
 | 
				
			||||||
                # 检查数据结构
 | 
					                # 检查新格式的数据结构
 | 
				
			||||||
                if "system" not in data or "conversation" not in data or not data["conversation"]:
 | 
					                if "messages" in data and isinstance(data["messages"], list) and len(data["messages"]) >= 3:
 | 
				
			||||||
                    print(f"警告: 数据缺少必要字段: {data}")
 | 
					                    # 提取系统指令
 | 
				
			||||||
 | 
					                    system_instruction = ""
 | 
				
			||||||
 | 
					                    human_content = ""
 | 
				
			||||||
 | 
					                    assistant_content = ""
 | 
				
			||||||
 | 
					                    
 | 
				
			||||||
 | 
					                    for msg in data["messages"]:
 | 
				
			||||||
 | 
					                        if msg["role"] == "system":
 | 
				
			||||||
 | 
					                            system_instruction = msg["content"]
 | 
				
			||||||
 | 
					                        elif msg["role"] == "user":
 | 
				
			||||||
 | 
					                            human_content = msg["content"]
 | 
				
			||||||
 | 
					                        elif msg["role"] == "assistant":
 | 
				
			||||||
 | 
					                            assistant_content = msg["content"]
 | 
				
			||||||
 | 
					                    
 | 
				
			||||||
 | 
					                    # 提取标题、作者和摘要
 | 
				
			||||||
 | 
					                    extracted = extract_title_author_and_abstract(human_content)
 | 
				
			||||||
 | 
					                    title = extracted.get("title", "")
 | 
				
			||||||
 | 
					                    authors = extracted.get("authors", "")
 | 
				
			||||||
 | 
					                    abstract = extracted.get("abstract", "")
 | 
				
			||||||
 | 
					                    
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                    n = min(num_templates, len(question_templates))
 | 
				
			||||||
 | 
					                    selected_templates = random.sample(question_templates, n)
 | 
				
			||||||
 | 
					                    # 为每个问题模板创建新数据
 | 
				
			||||||
 | 
					                    for template in selected_templates:
 | 
				
			||||||
 | 
					                        formatted_question = template.format(
 | 
				
			||||||
 | 
					                            title=title,
 | 
				
			||||||
 | 
					                            authors=authors,
 | 
				
			||||||
 | 
					                            abstract=abstract,
 | 
				
			||||||
 | 
					                            category_text=category_text
 | 
				
			||||||
 | 
					                        )
 | 
				
			||||||
 | 
					                        
 | 
				
			||||||
 | 
					                        # 创建新的数据条目(保持新格式)
 | 
				
			||||||
 | 
					                        new_data = {
 | 
				
			||||||
 | 
					                            "messages": [
 | 
				
			||||||
 | 
					                                {"role": "system", "content": system_instruction},
 | 
				
			||||||
 | 
					                                {"role": "user", "content": formatted_question},
 | 
				
			||||||
 | 
					                                {"role": "assistant", "content": assistant_content}
 | 
				
			||||||
 | 
					                            ]
 | 
				
			||||||
 | 
					                        }
 | 
				
			||||||
 | 
					                        multi_type_data.append(new_data)
 | 
				
			||||||
 | 
					                
 | 
				
			||||||
 | 
					                # 检查旧格式的数据结构
 | 
				
			||||||
 | 
					                elif "system" in data and "conversation" in data and data["conversation"]:
 | 
				
			||||||
 | 
					                    system_instruction = data.get("system", "根据论文的标题、作者和摘要,确定该论文的科学类别。")
 | 
				
			||||||
 | 
					                    
 | 
				
			||||||
 | 
					                    for turn in data["conversation"]:
 | 
				
			||||||
 | 
					                        if "human" in turn and "assistant" in turn:
 | 
				
			||||||
 | 
					                            extracted = extract_title_author_and_abstract(turn["human"])
 | 
				
			||||||
 | 
					                            title = extracted.get("title", "")
 | 
				
			||||||
 | 
					                            authors = extracted.get("authors", "")
 | 
				
			||||||
 | 
					                            abstract = extracted.get("abstract", "")
 | 
				
			||||||
 | 
					                            n = min(num_templates, len(question_templates))
 | 
				
			||||||
 | 
					                            selected_templates = random.sample(question_templates, n)
 | 
				
			||||||
 | 
					                            
 | 
				
			||||||
 | 
					                            for template in selected_templates:
 | 
				
			||||||
 | 
					                                formatted_question = template.format(
 | 
				
			||||||
 | 
					                                    title=title,
 | 
				
			||||||
 | 
					                                    authors=authors,
 | 
				
			||||||
 | 
					                                    abstract=abstract,
 | 
				
			||||||
 | 
					                                    category_text=category_text
 | 
				
			||||||
 | 
					                                )
 | 
				
			||||||
 | 
					                                
 | 
				
			||||||
 | 
					                                new_data = {
 | 
				
			||||||
 | 
					                                    "system": system_instruction,
 | 
				
			||||||
 | 
					                                    "conversation": [
 | 
				
			||||||
 | 
					                                        {
 | 
				
			||||||
 | 
					                                            "human": formatted_question,
 | 
				
			||||||
 | 
					                                            "assistant": turn["assistant"]
 | 
				
			||||||
 | 
					                                        }
 | 
				
			||||||
 | 
					                                    ]
 | 
				
			||||||
 | 
					                                }
 | 
				
			||||||
 | 
					                                multi_type_data.append(new_data)
 | 
				
			||||||
 | 
					                else:
 | 
				
			||||||
 | 
					                    print(f"警告: 数据格式不识别: {data}")
 | 
				
			||||||
                    continue
 | 
					                    continue
 | 
				
			||||||
            
 | 
					            
 | 
				
			||||||
                # 获取系统指令
 | 
					 | 
				
			||||||
                system_instruction = data.get("system", "根据论文的标题、作者和摘要,确定该论文的科学类别。")
 | 
					 | 
				
			||||||
                
 | 
					 | 
				
			||||||
                # 处理对话
 | 
					 | 
				
			||||||
                for turn in data["conversation"]:
 | 
					 | 
				
			||||||
                    if "human" in turn and "assistant" in turn:
 | 
					 | 
				
			||||||
                        # 提取标题、作者和摘要
 | 
					 | 
				
			||||||
                        extracted = extract_title_author_and_abstract(turn["human"])
 | 
					 | 
				
			||||||
                        title = extracted.get("title", "")
 | 
					 | 
				
			||||||
                        authors = extracted.get("authors", "")
 | 
					 | 
				
			||||||
                        abstract = extracted.get("abstract", "")
 | 
					 | 
				
			||||||
                        
 | 
					 | 
				
			||||||
                        # 为每个问题模板创建新数据
 | 
					 | 
				
			||||||
                        for template in question_templates:
 | 
					 | 
				
			||||||
                            # 格式化问题
 | 
					 | 
				
			||||||
                            formatted_question = template.format(
 | 
					 | 
				
			||||||
                                title=title,
 | 
					 | 
				
			||||||
                                authors=authors,
 | 
					 | 
				
			||||||
                                abstract=abstract,
 | 
					 | 
				
			||||||
                                category_text=category_text
 | 
					 | 
				
			||||||
                            )
 | 
					 | 
				
			||||||
                            
 | 
					 | 
				
			||||||
                            # 创建新的数据条目
 | 
					 | 
				
			||||||
                            new_data = {
 | 
					 | 
				
			||||||
                                "system": system_instruction,
 | 
					 | 
				
			||||||
                                "conversation": [
 | 
					 | 
				
			||||||
                                    {
 | 
					 | 
				
			||||||
                                        "human": formatted_question,
 | 
					 | 
				
			||||||
                                        "assistant": turn["assistant"]
 | 
					 | 
				
			||||||
                                    }
 | 
					 | 
				
			||||||
                                ]
 | 
					 | 
				
			||||||
                            }
 | 
					 | 
				
			||||||
                            multi_type_data.append(new_data)
 | 
					 | 
				
			||||||
            
 | 
					 | 
				
			||||||
            except json.JSONDecodeError:
 | 
					            except json.JSONDecodeError:
 | 
				
			||||||
                print(f"警告: 无法解析JSON行: {line}")
 | 
					                print(f"警告: 无法解析JSON行: {line}")
 | 
				
			||||||
            except Exception as e:
 | 
					            except Exception as e:
 | 
				
			||||||
@@ -248,10 +289,12 @@ if __name__ == "__main__":
 | 
				
			|||||||
    content_text="Based on the title 'The Quantum Primordial Black Holes, Dimensionless Small Parameter,   Inflationary Cosmology and Non-Gaussianity', authors 'Alexander Shalyt-Margolin', and abstract 'In the present work consideration is given to the primordial black holes ({\\bf pbhs}) in the Schwarzschild-de Sitter Metric with small mass (ultralight) in the preinflationary epoch. Within the scope of natural assumptions, it has been shown that the quantum-gravitational corrections ({\\bf qgcs}) to the characteristics of such black holes can contribute to all the cosmological parameters, shifting them compared with the semiclassical consideration. These contributions are determined by a series expansion in terms of a small parameter dependent on the hole mass (radius). For this pattern different cases have been considered (stationary, black hole evaporation...). It has been demonstrated that involvement of ({\\bf qgcs}) leads to a higher probability for the occurrence of such {\\bf pbhs}. Besides, high-energy deformations of Friedmann Equations created on the basis of these corrections have been derived for different patterns. In the last section of this work it is introduced a study into the contributions generated by the above-mentioned {\\bf qgcs} in inflationary cosmological perturbations. Besides, it has been shown that non-Gaussianity of these perturbations is higher as compared to the semi-classical pattern.', please determine the scientific category of this paper. Additional info: 35 pages, Latex , A. quant-ph\nB. physics.chem-ph\nC. physics.atom-ph\nD. cond-mat.soft\nE. cs.RO\nF. cs.CL\nG. cs.SE\nH. cs.IR\nI. hep-th\nJ. hep-ph\nK. physics.optics\nL. cs.AI\nM. cs.CV\nN. nucl-th\nO. astro-ph\nP. math.PR\nQ. cs.OS\nR. eess.SP\nS. math.OC\nT. math.DS\nU. math.DG\nV. math.MP\nW. cs.MM\nX. stat.ME\nY. math.CO\nZ. cs.NE"
 | 
					    content_text="Based on the title 'The Quantum Primordial Black Holes, Dimensionless Small Parameter,   Inflationary Cosmology and Non-Gaussianity', authors 'Alexander Shalyt-Margolin', and abstract 'In the present work consideration is given to the primordial black holes ({\\bf pbhs}) in the Schwarzschild-de Sitter Metric with small mass (ultralight) in the preinflationary epoch. Within the scope of natural assumptions, it has been shown that the quantum-gravitational corrections ({\\bf qgcs}) to the characteristics of such black holes can contribute to all the cosmological parameters, shifting them compared with the semiclassical consideration. These contributions are determined by a series expansion in terms of a small parameter dependent on the hole mass (radius). For this pattern different cases have been considered (stationary, black hole evaporation...). It has been demonstrated that involvement of ({\\bf qgcs}) leads to a higher probability for the occurrence of such {\\bf pbhs}. Besides, high-energy deformations of Friedmann Equations created on the basis of these corrections have been derived for different patterns. In the last section of this work it is introduced a study into the contributions generated by the above-mentioned {\\bf qgcs} in inflationary cosmological perturbations. Besides, it has been shown that non-Gaussianity of these perturbations is higher as compared to the semi-classical pattern.', please determine the scientific category of this paper. Additional info: 35 pages, Latex , A. quant-ph\nB. physics.chem-ph\nC. physics.atom-ph\nD. cond-mat.soft\nE. cs.RO\nF. cs.CL\nG. cs.SE\nH. cs.IR\nI. hep-th\nJ. hep-ph\nK. physics.optics\nL. cs.AI\nM. cs.CV\nN. nucl-th\nO. astro-ph\nP. math.PR\nQ. cs.OS\nR. eess.SP\nS. math.OC\nT. math.DS\nU. math.DG\nV. math.MP\nW. cs.MM\nX. stat.ME\nY. math.CO\nZ. cs.NE"
 | 
				
			||||||
    extract_title_author_and_abstract(content_text)
 | 
					    extract_title_author_and_abstract(content_text)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    input_file = "G:\\11\\data-prepare\\val_dataset.jsonl"
 | 
					    # input_file = "G:\\11\\data-prepare\\val_dataset.jsonl"
 | 
				
			||||||
    output_file = "G:\\11\\data-prepare\\val_dataset-m.jsonl"  # 输出文件路径
 | 
					    # output_file = "G:\\11\\data-prepare\\val_dataset-m2.jsonl"  # 输出文件路径
 | 
				
			||||||
 | 
					    input_file = "G:\\11\\data-prepare\\newformat_sft_test_data--swift-sft-26.jsonl"
 | 
				
			||||||
 | 
					    output_file = "G:\\11\\data-prepare\\newformat_sft_test_data--swift-sft-26-m2.jsonl"  # 输出文件路径    
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    convert_onedata2multi_type(input_file, output_file)
 | 
					    convert_onedata2multi_type(input_file, output_file, num_templates=2)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user