multi type question
This commit is contained in:
		
							
								
								
									
										259
									
								
								05-data-swfit-sft2multi_type.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										259
									
								
								05-data-swfit-sft2multi_type.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,259 @@
 | 
				
			|||||||
 | 
					      
 | 
				
			||||||
 | 
					import json
 | 
				
			||||||
 | 
					import os
 | 
				
			||||||
 | 
					import argparse
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def extract_title_author_and_abstract(content_text):
					    """
					    Pull the title, authors and abstract out of a classification prompt.

					    Expected prompt layout (everything after the abstract is ignored):
					        Based on the title '<title>', authors '<authors>', and abstract
					        '<abstract>', please determine the scientific category of this
					        paper. Additional info: ... A. quant-ph ... Z. cs.NE

					    The three fields are located by the markers that surround them instead
					    of by splitting on every quote, so an apostrophe inside a field (for
					    example "Einstein's") does not truncate it. Prompts that do not follow
					    the marker layout fall back to the positional split: the first three
					    chunks separated by a quote-comma pair, each read from its first quote.

					    Args:
					        content_text: The prompt text (the "human" side of a turn).

					    Returns:
					        A dict with the keys "title", "authors" and "abstract"; every value
					        has its surrounding whitespace stripped.

					    Raises:
					        ValueError: If three single-quoted fields cannot be found.
					    """
					    import re  # local import: the module header does not import re

					    # Lazy groups stop at the first closing marker, which is the quote that
					    # is directly followed by the next field label (or by the instruction /
					    # end of text for the abstract), not at the first quote of any kind.
					    marker_match = re.search(
					        r"title\s+'(?P<title>.*?)',\s+authors\s+'(?P<authors>.*?)',"
					        r"\s+and\s+abstract\s+'(?P<abstract>.*?)'"
					        r"(?:,\s+please\s+determine|\s*$)",
					        content_text,
					        flags=re.DOTALL,
					    )
					    if marker_match:
					        return {
					            name: value.strip()
					            for name, value in marker_match.groupdict().items()
					        }

					    # Fallback for prompts with different wording: positional split.
					    parts = content_text.split("',")
					    if len(parts) < 3:
					        raise ValueError(
					            "expected three single-quoted fields (title, authors, abstract) "
					            f"but found {len(parts)} segment(s)"
					        )

					    values = []
					    for part in parts[:3]:
					        pieces = part.split("'")
					        if len(pieces) < 2:
					            raise ValueError(f"no opening quote found in segment: {part!r}")
					        values.append(pieces[1].strip())

					    title, authors, abstract = values
					    return {"title": title, "authors": authors, "abstract": abstract}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def convert_to_alpaca_format(input_file, output_file):
					    """
					    Convert a Swift-format JSONL file into single-message "messages" records.

					    Input, one JSON object per line:
					        {
					            "system": "...",
					            "conversation": [
					                {"human": "Based on the title...", "assistant": "D"}
					            ]
					        }

					    Output, one JSON object per conversation turn that has both a "human"
					    and an "assistant" key:
					        {"messages": [{"role": "assistant",
					                       "content": "This is a paper titled <human text>"}]}

					    NOTE(review): despite the function name, the records written are not
					    Alpaca instruction/input/output triples, and neither "system" nor the
					    assistant answer is copied into the output - confirm this is intended.

					    Lines that are not valid JSON, or that lack "system" or "conversation",
					    are reported on stdout and skipped. Blank lines are skipped silently so
					    a trailing newline in the file does not produce a parse warning.

					    Args:
					        input_file: Path of the Swift-format JSONL file to read (UTF-8).
					        output_file: Path of the JSONL file to write (UTF-8, overwritten).
					    """
					    print(f"转换数据: {input_file} -> {output_file}")

					    converted_data = []
					    with open(input_file, "r", encoding="utf-8") as f:
					        for line in f:
					            stripped = line.strip()
					            if not stripped:
					                continue

					            try:
					                data = json.loads(stripped)

					                # Both top-level fields must be present.
					                if "system" not in data or "conversation" not in data:
					                    print(f"警告: 数据缺少必要字段: {data}")
					                    continue

					                for turn in data["conversation"]:
					                    if "human" in turn and "assistant" in turn:
					                        converted_data.append(
					                            {
					                                "messages": [
					                                    {
					                                        "role": "assistant",
					                                        "content": "This is a paper titled "
					                                        + turn["human"],
					                                    }
					                                ]
					                            }
					                        )

					            except json.JSONDecodeError:
					                print(f"警告: 无法解析JSON行: {line}")
					            except Exception as e:
					                # Best effort: one malformed record must not abort the run.
					                print(f"处理行时发生错误: {str(e)}")

					    # The output is opened only after the input has been read completely.
					    with open(output_file, "w", encoding="utf-8") as f:
					        for item in converted_data:
					            f.write(json.dumps(item, ensure_ascii=False) + "\n")

					    print(f"转换完成! 共转换 {len(converted_data)} 条数据")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def convert_onedata2multi_type(input_file, output_file):
					    """
					    Expand every Swift-format record into 20 differently worded questions.

					    Reads input_file line by line. For each conversation turn that has both
					    a "human" and an "assistant" key, the title, authors and abstract are
					    extracted from the "human" prompt with
					    extract_title_author_and_abstract and substituted into each of the 20
					    question templates below. Every rendered question is written as its own
					    Swift-format record that keeps the original assistant answer.

					    Records that are not valid JSON, that lack "system" or "conversation",
					    or whose "conversation" is empty are reported on stdout and skipped.
					    Blank lines are skipped silently. An empty "system" value is replaced
					    by the default instruction.

					    Args:
					        input_file: Path of the Swift-format JSONL file to read (UTF-8).
					        output_file: Path of the JSONL file to write (UTF-8, overwritten).
					    """
					    print(f"开始转换数据: {input_file} -> {output_file}")

					    # Answer options, placed before or after the question by each template.
					    category_text = " A. quant-ph\nB. physics.chem-ph\nC. physics.atom-ph\nD. cond-mat.soft\nE. cs.RO\nF. cs.CL\nG. cs.SE\nH. cs.IR\nI. hep-th\nJ. hep-ph\nK. physics.optics\nL. cs.AI\nM. cs.CV\nN. nucl-th\nO. astro-ph\nP. math.PR\nQ. cs.OS\nR. eess.SP\nS. math.OC\nT. math.DS\nU. math.DG\nV. math.MP\nW. cs.MM\nX. stat.ME\nY. math.CO\nZ. cs.NE\n"

					    # Used when the record's "system" value is empty.
					    default_system = "根据论文的标题、作者和摘要,确定该论文的科学类别。"

					    # The 20 question templates.
					    question_templates = [
					        # Direct question
					        "{category_text}What is the scientific category for a paper titled '{title}', authored by {authors}, with abstract '{abstract}'?",
					        # Imperative
					        "Classify this paper into its scientific category based on title '{title}', authors '{authors}', and abstract '{abstract}'.{category_text}",
					        # Descriptive lead-in
					        "{category_text}Given a research paper with title '{title}', authors {authors}, and abstract '{abstract}', identify the appropriate discipline.",
					        # Formal request
					        "Please assign the scientific category for the paper: title '{title}', authors '{authors}', abstract '{abstract}'.{category_text}",
					        # Abstract first
					        "Using the abstract '{abstract}', title '{title}', and authors '{authors}', determine the paper's category.{category_text}",
					        # Authors emphasised
					        "{category_text}From authors '{authors}', title '{title}', and abstract '{abstract}', what category does this paper fall into?",
					        # Statement followed by question
					        "Here's a paper: title '{title}', authors {authors}, abstract '{abstract}'. What is its scientific category?{category_text}",
					        # Terse
					        "Category for: title '{title}', authors '{authors}', abstract '{abstract}'?{category_text}",
					        # Embedded context
					        "Considering the title '{title}', the authors '{authors}', and the abstract content '{abstract}', please specify the paper's field.{category_text}",
					        # Informal, spoken
					        "Hey, what category is this paper? Title '{title}', by {authors}, abstract '{abstract}'.{category_text}",
					        # Field listing
					        "{category_text}Title: '{title}'. Authors: '{authors}'. Abstract: '{abstract}'. Now, what's the scientific category?",
					        # Hypothetical
					        "If a paper has title '{title}', authors '{authors}', and abstract '{abstract}', which scientific category best fits it?{category_text}",
					        # Key information emphasised
					        "Based solely on the title '{title}', authors list '{authors}', and abstract text '{abstract}', categorize this paper.{category_text}",
					        # Indirect question
					        "For the paper '{title}' by {authors}, with abstract '{abstract}', could you indicate its scientific discipline?{category_text}",
					        # Single integrated sentence
					        "Determine the category of the research paper entitled '{title}', written by {authors}, and summarized as '{abstract}'.{category_text}",
					        # Abstract-focused question
					        "The abstract '{abstract}' describes a paper titled '{title}' by authors '{authors}'. What category is it?{category_text}",
					        # Title driven
					        "{category_text}Starting from the title '{title}', and considering authors '{authors}' and abstract '{abstract}', what is the paper's category?",
					        # Multi-part query
					        "Part 1: Title is '{title}'. Part 2: Authors are '{authors}'. Part 3: Abstract is '{abstract}'. Based on this, classify the paper.{category_text}",
					        # "How would you" phrasing
					        "Given the details: title '{title}', authors '{authors}', abstract '{abstract}', how would you categorize this paper scientifically?{category_text}",
					        # Action oriented
					        "Using the provided title '{title}', authors '{authors}', and abstract '{abstract}', output the scientific category for this paper.{category_text}",
					    ]

					    multi_type_data = []

					    with open(input_file, "r", encoding="utf-8") as f:
					        for line in f:
					            stripped = line.strip()
					            if not stripped:
					                continue

					            try:
					                data = json.loads(stripped)

					                # Both fields must exist and the conversation must be non-empty.
					                if (
					                    "system" not in data
					                    or "conversation" not in data
					                    or not data["conversation"]
					                ):
					                    print(f"警告: 数据缺少必要字段: {data}")
					                    continue

					                # "system" is guaranteed to be present here, so a default
					                # passed to get() would never apply; fall back on empty.
					                system_instruction = data.get("system") or default_system

					                for turn in data["conversation"]:
					                    if "human" in turn and "assistant" in turn:
					                        extracted = extract_title_author_and_abstract(
					                            turn["human"]
					                        )
					                        title = extracted.get("title", "")
					                        authors = extracted.get("authors", "")
					                        abstract = extracted.get("abstract", "")

					                        # One output record per template.
					                        for template in question_templates:
					                            formatted_question = template.format(
					                                title=title,
					                                authors=authors,
					                                abstract=abstract,
					                                category_text=category_text,
					                            )
					                            multi_type_data.append(
					                                {
					                                    "system": system_instruction,
					                                    "conversation": [
					                                        {
					                                            "human": formatted_question,
					                                            "assistant": turn["assistant"],
					                                        }
					                                    ],
					                                }
					                            )

					            except json.JSONDecodeError:
					                print(f"警告: 无法解析JSON行: {line}")
					            except Exception as e:
					                # Best effort: a prompt that cannot be parsed must not
					                # abort the whole run.
					                print(f"处理行时发生错误: {str(e)}")

					    # The output is opened only after the input has been read completely.
					    with open(output_file, "w", encoding="utf-8") as f:
					        for item in multi_type_data:
					            f.write(json.dumps(item, ensure_ascii=False) + "\n")

					    print(f"转换完成! 共转换 {len(multi_type_data)} 条数据")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if __name__ == "__main__":
					    # Script entry point. Both paths are optional positional arguments; when
					    # they are omitted the previously hard-coded locations are used, so
					    # running the script without arguments behaves as before.
					    parser = argparse.ArgumentParser(
					        description=(
					            "Expand each record of a Swift-format JSONL dataset into "
					            "20 differently worded classification questions."
					        )
					    )
					    parser.add_argument(
					        "input_file",
					        nargs="?",
					        default="G:\\11\\data-prepare\\val_dataset.jsonl",
					        help="Swift-format JSONL file to read",
					    )
					    parser.add_argument(
					        "output_file",
					        nargs="?",
					        default="G:\\11\\data-prepare\\val_dataset-m.jsonl",
					        help="JSONL file to write",
					    )
					    args = parser.parse_args()

					    convert_onedata2multi_type(args.input_file, args.output_file)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		Reference in New Issue
	
	Block a user