将用于导入 CiteSpace 的 RefWorks 格式文件和 WoS 导出的 txt 文件转换为 xlsx 格式

蓝海岛中月

已于 2025-03-02 00:15:51 修改

阅读量367

点赞数 1

CC 4.0 BY-SA版权

文章标签： python

于 2025-03-01 23:45:19 首次发布

本文链接：https://blog.youkuaiyun.com/qq_74806995/article/details/145956124

WoS 的 txt 文件转换

import pandas as pd

# Field tags that are kept when parsing a Web of Science plain-text export.
# Lines carrying any other two-character tag are ignored.
VALID_TAGS = [
    'PT', 'AU', 'AF', 'TI', 'SO', 'LA', 'DT', 'DE', 'ID', 'AB', 'C1', 'C3',
    'RP', 'EM', 'RI', 'CR', 'NR', 'TC', 'Z9', 'U1', 'U2', 'PU', 'PI', 'PA',
    'SN', 'EI', 'J9', 'JI', 'PD', 'PY', 'VL', 'IS', 'SU', 'BP', 'EP', 'DI',
    'PG', 'WC', 'WE', 'SC', 'GA', 'UT', 'PM', 'DA', 'ER',
]

# Fields whose continuation lines each hold a separate item (one author,
# address or cited reference per line).  Their lines are joined with "; " so
# the items stay distinguishable; every other field is wrapped running text
# and is joined with a single space.
_ITEM_PER_LINE_TAGS = frozenset({'AU', 'AF', 'C1', 'CR'})


def _append_continuation(record, tag, text):
    """Append a continuation line to the latest value stored under *tag*."""
    separator = "; " if tag in _ITEM_PER_LINE_TAGS else " "
    value = record[tag]
    if isinstance(value, list):
        # A repeated tag is stored as a list; extend its most recent entry.
        value[-1] = f"{value[-1]}{separator}{text}" if value[-1] else text
    else:
        record[tag] = f"{value}{separator}{text}" if value else text


def extract_info(text):
    """Parse tagged plain-text records into a list of dictionaries.

    Each record is a run of lines of the form ``"<TAG> <value>"`` and is
    terminated by a line containing only ``ER``.  Lines that follow a tag
    line without carrying a tag of their own -- whether indented or not --
    are continuation lines and are appended to the value of the most recent
    kept tag.

    Args:
        text: Full content of the export file.

    Returns:
        A list with one dict per ``ER``-terminated record, mapping each tag
        from ``VALID_TAGS`` to its value as a string.  If the same tag
        occurs on several tag lines within one record, the value is a list
        of strings instead.  Lines after the last ``ER`` are not returned.
    """
    records = []
    current_record = {}
    current_tag = None

    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        # Indented lines continue the previous field.  They must be handled
        # before tag detection, otherwise continued authors, titles,
        # abstracts and references are lost.
        if raw_line[0] in " \t":
            if current_tag:
                _append_continuation(current_record, current_tag, line)
            continue

        if line == 'ER':
            # End of the current record.
            records.append(current_record)
            current_record = {}
            current_tag = None
            continue

        # A tag line is: uppercase letter, uppercase letter or digit, then
        # either a space or the end of the line (a tag with an empty value).
        is_tag_line = (
            len(line) >= 2
            and line[0].isupper()
            and (line[1].isupper() or line[1].isdigit())
            and (len(line) == 2 or line[2] == " ")
        )

        if is_tag_line:
            tag = line[:2]
            if tag not in VALID_TAGS:
                # Forget the active tag so that continuation lines of an
                # ignored field are not appended to the previous kept field.
                current_tag = None
                continue

            info = line[2:].strip()
            current_tag = tag
            if tag not in current_record:
                current_record[tag] = info
            elif isinstance(current_record[tag], list):
                current_record[tag].append(info)
            else:
                current_record[tag] = [current_record[tag], info]
        elif current_tag:
            # Unindented line without a tag: also treated as a continuation.
            _append_continuation(current_record, current_tag, line)

    return records


# Read the whole input file into memory
file_path = r"input.txt"
try:
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
except FileNotFoundError:
    # Report the missing input file and skip the conversion instead of raising
    print(f"文件 {file_path} 未找到，请检查文件路径是否正确。")
else:
    # Parse the text into a list of per-record dictionaries
    result = extract_info(text)

    # Build a DataFrame from the records and print it for inspection
    df = pd.DataFrame(result)
    print(df)

    # Save the table as an Excel file (without the index column)
    output_path = r"output.xlsx"
    df.to_excel(output_path, index=False)
    print(f"数据已成功保存到 {output_path}")

保存表格如下

CNKI 的 RefWorks 文件转换

import pandas as pd

# Input (RefWorks-format text export) and output (Excel workbook) locations
file_path = r"input.txt"
output_path = r"output.xlsx"

# Read the whole export.  'utf-8-sig' decodes plain UTF-8 as well, but also
# drops a leading byte-order mark, which would otherwise stick to the first
# label and make the first record lose that field.
with open(file_path, 'r', encoding='utf-8-sig') as file:
    content = file.read()

# Records are separated by blank lines.  Empty chunks (trailing or repeated
# blank lines) are discarded so they do not become empty spreadsheet rows.
articles = [chunk for chunk in content.split('\n\n') if chunk.strip()]

# Labels to extract; they also define the column order of the output
labels = ['RT', 'SR', 'A1', 'AD', 'T1', 'JF', 'YR', 'IS', 'vo', 'OP', 'K1', 'AB', 'SN', 'CN', 'LA', 'DS', 'LK', 'DO']

# One dictionary per article, keyed by label
data = []
for article in articles:
    article_data = {label: None for label in labels}
    lines = article.strip().split('\n')
    for line in lines:
        line = line.strip()
        if not line:
            continue

        # A field line is "<label> <value>"; lines without a space are skipped
        parts = line.split(' ', 1)
        if len(parts) != 2:
            continue

        label, value = parts
        label = label.strip()
        value = value.strip()
        if label not in labels:
            continue

        if article_data[label] is None:
            article_data[label] = value
        elif value:
            # A label repeated within one record (e.g. one line per author)
            # is accumulated instead of overwriting the earlier value.
            previous = article_data[label]
            article_data[label] = f"{previous}; {value}" if previous else value

    # Skip chunks in which no known label was found
    if any(field is not None for field in article_data.values()):
        data.append(article_data)

# Build the table with one column per label, in the order given above
df = pd.DataFrame(data, columns=labels)

# Export to an Excel file (without the index column)
df.to_excel(output_path, index=False)