WOS的txt文件转换
import pandas as pd
# 定义有效的标签列表
VALID_TAGS = ['PT', 'AU', 'AF', 'TI', 'SO', 'LA', 'DT', 'DE', 'ID', 'AB', 'C1', 'C3', 'RP', 'EM', 'RI', 'CR', 'NR', 'TC', 'Z9', 'U1', 'U2', 'PU', 'PI', 'PA', 'SN', 'EI', 'J9', 'JI', 'PD', 'PY', 'VL', 'IS', 'SU', 'BP', 'EP', 'DI', 'PG', 'WC', 'WE', 'SC', 'GA', 'UT', 'PM', 'DA', 'ER']
def extract_info(text):
records = []
current_record = {}
current_tag = None
lines = text.splitlines()
for line in lines:
# 检查行首是否无空格
if line and not line.startswith(" "):
line = line.strip()
if line == 'ER':
# 一条记录结束,添加到记录列表中
records.append(current_record)
current_record = {}
current_tag = None
continue
# 检查行首是否为大写字母开头,第二个字符为大写字母或数字,第三个字符为空格
if len(line) >= 3 and line[0].isupper() and (line[1].isupper() or line[1].isdigit()) and line[2] == " ":
tag = line[:3].strip()
if tag in VALID_TAGS:
info = line[3:].strip()
current_tag = tag
if tag in current_record:
if isinstance(current_record[tag], list):
current_record[tag].append(info)
else:
current_record[tag] = [current_record[tag], info]
else:
current_record[tag] = info
elif line and current_tag:
# 无标签行,追加到当前标签的信息中
if isinstance(current_record[current_tag], list):
current_record[current_tag][-1] += " " + line
else:
current_record[current_tag] += " " + line
return records
# 从文件中读取文本内容
file_path = r"input.txt"
try:
with open(file_path, 'r', encoding='utf-8') as file:
text = file.read()
except FileNotFoundError:
print(f"文件 {file_path} 未找到,请检查文件路径是否正确。")
else:
# 提取信息
result = extract_info(text)
# 转换为 DataFrame 查看
df = pd.DataFrame(result)
print(df)
# 保存为 Excel 文件
output_path = r"output.xlsx"
df.to_excel(output_path, index=False)
print(f"数据已成功保存到 {output_path}")
保存表格如下
CNKI的Refworks文件转换
import pandas as pd
# 定义文件路径
file_path = r"input.txt"
output_path = r"output.xlsx"
# 读取文件内容
with open(file_path, 'r', encoding='utf-8') as file:
content = file.read()
# 分割不同文章
articles = content.split('\n\n')
# 定义标签
labels = ['RT', 'SR', 'A1', 'AD', 'T1', 'JF', 'YR', 'IS', 'vo', 'OP', 'K1', 'AB', 'SN', 'CN', 'LA', 'DS', 'LK', 'DO']
# 存储每篇文章的数据
data = []
for article in articles:
article_data = {label: None for label in labels}
lines = article.strip().split('\n')
for line in lines:
line = line.strip()
if line:
parts = line.split(' ', 1)
if len(parts) == 2:
label, value = parts
label = label.strip()
value = value.strip()
if label in labels:
article_data[label] = value
data.append(article_data)
# 创建 DataFrame
df = pd.DataFrame(data, columns=labels)
# 导出为 Excel 文件
df.to_excel(output_path, index=False)