#encoding=utf-8
import csv
import lxml.etree
from lxml import etree
from io import StringIO
import xml.etree.ElementTree as et
# First attempt: walk the XML tree with ElementTree and write keyword text
# to a UTF-16 CSV file.
header = ('type', 'ID', 'Text', 'PublishData', 'Code', 'Keyword')
infile = '/home/henson/Desktop/001/666.xml'

# newline='' hands line-ending control to the csv module, as its
# documentation requires for files passed to csv.writer.
with open('/home/henson/Desktop/001/output.csv', 'w',
          encoding='utf-16', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(header)

    tree = et.ElementTree(file=infile)
    root = tree.getroot()
    for PeriodicalPaper in root:
        for child in PeriodicalPaper:
            # findall() returns Element objects; writing those directly puts
            # their repr (or nothing, for an empty list) into the CSV.
            # Extract the text content instead.
            # NOTE(review): findall('Keyword') only matches *direct* children
            # of `child` -- confirm against the real layout of 666.xml.
            row = [keyword.text or '' for keyword in child.findall('Keyword')]
            print(row)
            writer.writerow(row)
import csv
import lxml
from bs4 import BeautifulSoup
# Second attempt: load the XML with BeautifulSoup and start a UTF-8 CSV.
xml_path = '/home/henson/Desktop/001/666.xml'

# BeautifulSoup parses the markup it is handed; passing the path string makes
# it parse the path text itself.  Open the file and pass the handle instead.
# The 'xml' feature selects lxml's XML parser, which keeps tag-name case so
# that `soup.Text` can match a <Text> element (the HTML parser lower-cases
# tag names).  Binary mode lets the parser detect the document's encoding.
with open(xml_path, 'rb') as xml_file:
    soup = BeautifulSoup(xml_file, 'xml')

header = ('type', 'ID', 'Text', 'PublishData', 'Code', 'Keyword')

# newline='' hands line-ending control to the csv module.
with open('output.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)

# Show the first <Text> element found in the document (None if absent).
print(soup.Text)
from xml.dom.minidom import parseString
def parseXML(fpath):
    """Collect the Text, ID and Keywords nodes of every PeriodicalPaper.

    Args:
        fpath: Path of the XML file to parse.

    Returns:
        A list with one ``[Text, ID, Keyword]`` entry per ``PeriodicalPaper``
        element.  Each item is the NodeList returned by
        ``getElementsByTagName`` for the tags ``Text``, ``ID`` and
        ``Keywords`` respectively (a NodeList may be empty).  An empty list
        is returned when the file is not well-formed XML.

    Raises:
        OSError: If ``fpath`` cannot be opened for reading.
    """
    from xml.parsers.expat import ExpatError

    # Read raw bytes so the XML parser decodes the document itself rather
    # than relying on the platform's default text encoding.
    with open(fpath, "rb") as xml_file:
        content = xml_file.read()

    try:
        xmldoc = parseString(content)
    except ExpatError:
        # Report the problem and stop here; continuing would only fail on
        # the undefined document object.
        print ("ill formed xml file")
        return []

    tupleList = []
    for doc in xmldoc.getElementsByTagName('PeriodicalPaper'):
        Text = doc.getElementsByTagName('Text')
        ID = doc.getElementsByTagName('ID')
        Keyword = doc.getElementsByTagName('Keywords')
        tupleList.append([Text, ID, Keyword])
    return tupleList
if __name__ == "__main__":
    # Print the text of the first Text, ID and Keywords node of each record.
    ls = parseXML(r"/home/henson/Desktop/001/666.xml")
    for em in ls:
        for e in em:
            # A record may lack a tag (empty NodeList) or the element may be
            # empty (no first child); print a blank line instead of crashing.
            first_child = e[0].firstChild if len(e) else None
            print (getattr(first_child, "data", ""))
xml转csv,研究了一天都不知道怎么弄。用了lxml解析,但是还是无法输出中文,只有[][][][];改用getElementsByTagName来获取,但得到的又不是我要的格式。
感觉应该很简单,但实际上又没那么简单,费了一天也没搞出来,花的时间比我预期的还要多,挫败感十足。