dblp数据集下载地址:https://dblp.org/xml/
拿到数据集后,首先要把它转换成csv格式的文件
xml转json格式:
import xml.sax
from xml.sax.handler import ContentHandler
from xml.sax import parse
import json
# Module-level state shared by the XML -> JSON conversion code in this file.
# NOTE(review): how `all_json` and `json_str` get filled is not visible in this
# part of the file -- confirm against the handler code that uses them.
all_json = []
json_str = ''
# Output file, opened for writing as UTF-8. It is not closed here; closing is
# left to whatever code finishes writing to it.
doc = open('out2.json', mode='w', encoding='utf-8')
class article(xml.sax.ContentHandler):
def __init__(self):
self.CurrentData = ""
self.author = ""
self.title = ""
self.pages = ""
self.journal = ""
self.authorIndex = 0
self.authors = []
self.authorNum = 0
# 元素开始事件处理
def startElement(self, tag, attributes):
self.CurrentData = tag
target = ['incollection',"article",'mastersthesis']
if tag in target:
if len(self.authors) is not 0: