XML文档
Document 对象
Document 对象是一棵文档树的根,可为我们提供对文档数据的最初(或最顶层)的访问入口。
由于元素节点、文本节点、注释、处理指令等均无法存在于 document 之外,document 对象同样提供了创建这些对象的方法。Node 对象提供了一个 ownerDocument 属性,此属性可把它们与在其中创建它们的 Document 关联起来。
createElement() 方法可创建元素节点。
此方法可返回一个 Element 对象。
语法
createElement(name)
- 参数 : name
- 描述 :字符串值,这个字符串可为此元素节点规定名称。
createTextNode() 方法可创建文本节点,其内容即为节点的文本值(data)。
此方法可返回 Text 对象
语法
createTextNode(data)
- 参数 :data
- 描述 :字符串值,可规定此节点的文本。
setAttribute()给节点添加属性值(Attribute)
node.setAttribute("att_name", "att_value")
在创建完节点(节点值)之后,还需使用下面的方法把它添加到指定节点的下面:
prev_node.appendChild(cur_node)
这里的 prev_node 是要添加节点的上一层节点(父节点),而 cur_node 即为当前要添加的节点。
遇到问题:
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe8 in position 54: ordinal not in range(128)
解决方法:
原因是 Python 2 的 str 默认按 ascii 编码处理,当含有非 ASCII 字节的 str 与 unicode 混用时会发生隐式解码,从而报出上面的错误。在代码中加上如下几句即可(仅适用于 Python 2)。
import sys
reload(sys)
sys.setdefaultencoding('utf8')
其他更多XML Document对象函数,请查看:
http://www.w3school.com.cn/xmldom/dom_document.asp
代码:
# -*- coding: utf-8 -*-
from xml.dom.minidom import Document
import os
import os.path
from PIL import Image
import sys
# Python 2-only workaround for the UnicodeDecodeError raised when byte strings
# holding non-ASCII text are implicitly decoded with the default 'ascii'
# codec: switch the interpreter-wide default encoding to UTF-8.
# reload(sys) comes first because sys.setdefaultencoding is not reachable on a
# normally imported sys module in Python 2.
# NOTE(review): neither the reload() builtin nor sys.setdefaultencoding exists
# on Python 3 -- these two lines must be dropped when porting.
reload(sys)
sys.setdefaultencoding('utf8')
# Conversion locations. Every path ends with a slash because the rest of the
# script builds file paths by plain string concatenation.
ann_path = "/home/ubuntu/Downloads/txt_9000/"    # input: .txt annotation files
img_path = "/home/ubuntu/Downloads/image_9000/"  # input: matching .jpg images
xml_path = "/home/ubuntu/Downloads/label_9000/"  # output: generated .xml files
# makedirs (instead of mkdir) also creates any missing parent directory, so
# the script does not abort when only part of the output path exists yet.
if not os.path.exists(xml_path):
    os.makedirs(xml_path)
def writeXml(tmp, imgname, w, h, objbud, wxml):
    """Write a PASCAL VOC style annotation XML file for one image.

    Args:
        tmp: Scratch directory path. Kept only for backward compatibility
            with existing callers; the function no longer writes a temporary
            file, so the value is not used.
        imgname: Image file name stored in the <filename> element.
        w: Image width; written as str(w) into <size>/<width>.
        h: Image height; written as str(h) into <size>/<height>.
        objbud: Flat sequence of strings read in groups of five:
            name, xmin, ymin, xmax, ymax. A trailing group with fewer than
            five items is ignored.
        wxml: Path of the XML file to create (overwritten if it exists).

    Returns:
        None. The result is the file written at ``wxml``: the pretty-printed
        document without the leading ``<?xml ...?>`` declaration line.
    """
    doc = Document()
    annotation = doc.createElement('annotation')
    doc.appendChild(annotation)

    _append_text_element(doc, annotation, 'folder', "VOC2007")
    _append_text_element(doc, annotation, 'filename', imgname)

    # <source> block with fixed dataset metadata.
    source = doc.createElement('source')
    annotation.appendChild(source)
    _append_text_element(doc, source, 'database', "The VOC2007 Database")
    _append_text_element(doc, source, 'annotation', "PASCAL VOC2007 ")
    _append_text_element(doc, source, 'image', "flickr")

    # <size> block; depth is always written as "3".
    size = doc.createElement('size')
    annotation.appendChild(size)
    _append_text_element(doc, size, 'width', str(w))
    _append_text_element(doc, size, 'height', str(h))
    _append_text_element(doc, size, 'depth', "3")

    _append_text_element(doc, annotation, 'segmented', "0")

    # Floor division keeps the group count an int on both Python 2 and 3.
    for i in range(len(objbud) // 5):
        base = i * 5
        object_new = doc.createElement("object")
        annotation.appendChild(object_new)
        _append_text_element(doc, object_new, 'name', objbud[base])
        _append_text_element(doc, object_new, 'pose', "Unspecified")
        _append_text_element(doc, object_new, 'truncated', "0")
        _append_text_element(doc, object_new, 'difficult', "0")

        bndbox = doc.createElement('bndbox')
        object_new.appendChild(bndbox)
        _append_text_element(doc, bndbox, 'xmin', objbud[base + 1])
        _append_text_element(doc, bndbox, 'ymin', objbud[base + 2])
        _append_text_element(doc, bndbox, 'xmax', objbud[base + 3])
        _append_text_element(doc, bndbox, 'ymax', objbud[base + 4])

    # With an explicit encoding, toprettyxml returns encoded bytes, so the
    # declaration is stripped in memory and the file is opened in binary mode.
    xml_bytes = doc.toprettyxml(indent="\t", newl="\n", encoding="utf-8")
    # Drop the first line (XML declaration) and the empty string that follows
    # the final newline.
    body_lines = xml_bytes.split(b"\n")[1:-1]
    with open(wxml, "wb") as fw:
        fw.write(b"".join(line + b"\n" for line in body_lines))
    return


def _append_text_element(doc, parent, tag, text):
    """Create <tag>text</tag> under ``parent`` and return the new element."""
    element = doc.createElement(tag)
    parent.appendChild(element)
    element.appendChild(doc.createTextNode(text))
    return element
# Convert every annotation file found under ann_path into a VOC XML file.
# Each `files` value is the (dirpath, dirnames, filenames) tuple from os.walk.
for files in os.walk(ann_path):
    # Scratch directory handed to writeXml; created on demand and removed
    # again once the files of this directory have been processed.
    temp = "/home/ubuntu/temp/"
    if not os.path.exists(temp):
        os.mkdir(temp)
    for file in files[2]:
        # Call form of print works on both Python 2 and Python 3.
        print(file + "-->start!")
        stem = os.path.splitext(file)[0]
        img_name = stem + '.jpg'
        fileimgpath = img_path + img_name
        # The image is opened only to read its pixel dimensions.
        im = Image.open(fileimgpath)
        width = int(im.size[0])
        height = int(im.size[1])
        # `with` closes the annotation file even if reading fails.
        with open(ann_path + file, "r") as filelabel:
            lines = filelabel.read().split('\n')
        # Drop the last element, which is the empty string left after the
        # final newline of the annotation file.
        obj = lines[:len(lines) - 1]
        filename = xml_path + stem + '.xml'
        writeXml(temp, img_name, width, height, obj, filename)
    os.rmdir(temp)
参考文献:https://blog.youkuaiyun.com/samylee/article/details/62040727