XML详解
1、XML
1.1 XML特点
1.2 XML的应用
1.2.1 配置文件
1.2.2 传输数据
1.3 XML和HTML的区别
1.4 XML基本语法
1.5 XML组成部分
1.5.1 文档声明
1.5.2 标签
1.5.3 指令
1.5.4 属性
1.5.5 文本
2、约束
2.1 DTD 约束
- student.dtd
<!ELEMENT students (student*)>
<!ELEMENT student (name,age,sex)>
<!ELEMENT name (#PCDATA)>
<!ELEMENT age (#PCDATA)>
<!ELEMENT sex (#PCDATA)>
<!ATTLIST student number ID #REQUIRED>
- student.xml
<?xml version="1.0"?>
<!DOCTYPE students SYSTEM "student.dtd">
<students>
<student number="A1001">
<name>AA</name>
<age>18</age>
<sex>man</sex>
</student>
<student number="B1001">
<name>BB</name>
<age>28</age>
<sex>woman</sex>
</student>
</students>
2.1.1 DTD验证
2.2 Schema 约束
2.2.1 Schema验证
- book.xsd
<?xml version="1.0" encoding="UTF-8" ?>
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
<xs:element name="books">
<xs:complexType>
<xs:sequence>
<xs:element name="book" maxOccurs="unbounded">
<xs:complexType>
<xs:sequence>
<xs:element name="name" type="xs:string"></xs:element>
<xs:element name="author" type="xs:string"></xs:element>
<xs:element name="price" type="xs:double"></xs:element>
</xs:sequence>
<xs:attribute name="id" type="xs:positiveInteger" use="required"></xs:attribute>
</xs:complexType>
</xs:element>
</xs:sequence>
</xs:complexType>
</xs:element>
</xs:schema>
- book.xml
<?xml version="1.0" encoding="UTF-8" ?>
<books xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="book.xsd" >
<book id="1001">
<name>java开发实战</name>
<author>张小三</author>
<price>98.5</price>
</book>
</books>
- Demo.java
import org.xml.sax.SAXException;
import javax.xml.transform.Source;
import javax.xml.transform.stream.StreamSource;
import javax.xml.validation.Schema;
import javax.xml.validation.SchemaFactory;
import javax.xml.validation.Validator;
import java.io.File;
import java.io.IOException;
public class Demo {
public static void main(String[] args) throws SAXException{
// 创建SchemaFactory工厂
SchemaFactory sch = SchemaFactory.newInstance("http://www.w3.org/2001/XMLSchema");
// 建立验证文件对象
File schemaFile = new File("src/com/jsoup/xml/book.xsd");
// 利用SchemaFactory工厂对象,接收验证的文件对象,生成Schema对象
Schema schema = sch.newSchema(schemaFile);
// 产生对此schema的验证器
Validator validator = schema.newValidator();
// 验证的数据(准备的数据源)
Source source = new StreamSource("src/com/jsoup/xml/book.xml");
// 开始验证
try {
validator.validate(source);
System.out.println("验证成功");
} catch (IOException e) {
e.printStackTrace();
System.out.println("验证失败");
}
}
}
3、XML解析
3.1 XML解析思想
3.2 常见解析器
3.3 Jsoup解析器
3.3.1 Jsoup解析
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.IOException;
import java.net.URL;
public class Demo {
public static void main(String[] args) throws IOException {
// 解析本地xml文件
String path = Demo.class.getClassLoader().getResource("com/jsoup/xml/index.html").getPath();
// System.out.println(path);
Document document = Jsoup.parse(new File(path),"utf-8");
// System.out.println(document);
// 解析字符串
// Document doc = Jsoup.parse("<!DOCTYPE html>\n" +
// "<html lang=\"en\">\n" +
// "<head>\n" +
// " <meta charset=\"UTF-8\">\n" +
// " <title>Title</title>\n" +
// "</head>\n" +
// "<body>\n" +
// " <p>Hello world!</p>\n" +
// "</body>\n" +
// "</html>");
// System.out.println(doc);
// 解析网络资源
// Document doc1 = Jsoup.parse(new URL("https://www.baidu.com/s?wd=xml+schema"),5000);
// System.out.println(doc1);
// 根据id获取元素
// Element e1 = document.getElementById("hh");
// 根据标签获取元素
Elements e1 = document.getElementsByTag("p");
// for (Element element : e1) {
// System.out.println(element);
// System.out.println(element.text());
// }
// 根据属性获取元素
Elements e2 = document.getElementsByAttribute("name");
// for (Element element : e2) {
// System.out.println(element);
// }
// 根据属性名=属性值获取元素
Elements ee = document.getElementsByAttributeValue("name","ttt");
// for (Element element : ee) {
// System.out.println(element);
// }
// 使用css选择器获取元素
Elements select = document.select("#hh");
System.out.println(select.text()); // 获取文本值
System.out.println(select.html()); // 获取文本值,如果有标签,也会显示出来
System.out.println(select.attr("id")); // 获取id属性的属性值
}
}
3.3.2 XPath解析
import cn.wanghaomiao.xpath.model.JXDocument;
import cn.wanghaomiao.xpath.model.JXNode;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.io.File;
import java.io.IOException;
import java.util.List;
public class Demo {
public static void main(String[] args) throws IOException {
// XPath 获取元素节点的方式
// 获取Document对象
Document document = Jsoup.parse(new File(Demo.class.getClassLoader().getResource("com/jsoup/xml/index.html").getPath()),"utf-8");
// 使用XPath,需要将Document对象转成JXDocument对象,才可使用
JXDocument jxDocument = new JXDocument(document);
// JXDocument调用selN(String xpath),获取List<JXNode>对象
List<JXNode> jxNodes = jxDocument.selN("//body");
for (JXNode jxNode : jxNodes) {
// 获取element
Element element = jxNode.getElement();
System.out.println(element);
}
}
}
3.4 DOM 方式解析XML数据
3.4.1 解析XML文件的方式
3.4.2 DOM解析XML的步骤
- book.xsd
<?xml version="1.0" encoding="UTF-8" ?>
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
<xs:element name="books">
<xs:complexType>
<xs:sequence>
<xs:element name="book" maxOccurs="unbounded">
<xs:complexType>
<xs:sequence>
<xs:element name="name" type="xs:string"></xs:element>
<xs:element name="author" type="xs:string"></xs:element>
<xs:element name="price" type="xs:double"></xs:element>
</xs:sequence>
<xs:attribute name="id" type="xs:positiveInteger" use="required"></xs:attribute>
</xs:complexType>
</xs:element>
</xs:sequence>
</xs:complexType>
</xs:element>
</xs:schema>
- book.xml
<?xml version="1.0" encoding="UTF-8" ?>
<books xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="book.xsd" >
<book id="1001">
<name>java开发实战</name>
<author>张小三</author>
<price>98.5</price>
</book>
</books>
- 类TestDOMParse
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
public class TestDOMParse {
public static void main(String[] args) throws ParserConfigurationException, IOException, SAXException {
// 创建一个DocumentBuilderFactory的对象
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
// 创建一个DocumentBuilder对象
DocumentBuilder db = dbf.newDocumentBuilder();
// 通过DomcumentBuilder的parse()方法得到Document对象
Document document = db.parse("src/com/jsoup/xml/book.xml");
// 通过getElementsByTagName()方法获取节点的列表
NodeList bookList = document.getElementsByTagName("book");
// System.out.println(bookList.getLength()); // 获取book节点的个数
// 通过for循环遍历每一个节点
for (int i = 0; i < bookList.getLength(); i++) {
// 得到每个节点的属性和属性值
Node book = bookList.item(i);
NamedNodeMap attrs = book.getAttributes(); // 得到属性的集合
// 循环遍历每一个属性
for (int j = 0; j < attrs.getLength(); j++) {
// 得到每一个属性
Node id = attrs.item(j);
System.out.println("属性名称:" + id.getNodeName() + "\t" + id.getNodeValue());
}
}
// 得到每个节点的节点和节点值
for (int i = 0;i < bookList.getLength(); i++) {
// 得到每一个book节点
Node book = bookList.item(i);
NodeList subNode = book.getChildNodes();
// 遍历每一个book的子结点
for(int j = 0; j < subNode.getLength(); j++) {
Node childNode = subNode.item(j);
// System.out.println(childNode.getNodeName()); // 空白也算一个节点
short type = childNode.getNodeType(); // 获取节点的类型
if (type == Node.ELEMENT_NODE/*节点是元素类型*/) {
System.out.println("节点的名称:" + childNode.getNodeName() + "\t" + childNode.getTextContent());
}
}
}
}
}
3.5 SAX 方式解析XML数据
3.5.1 SAX解析XML的步骤
- book.xsd
<?xml version="1.0" encoding="UTF-8" ?>
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
<xs:element name="books">
<xs:complexType>
<xs:sequence>
<xs:element name="book" maxOccurs="unbounded">
<xs:complexType>
<xs:sequence>
<xs:element name="name" type="xs:string"></xs:element>
<xs:element name="author" type="xs:string"></xs:element>
<xs:element name="price" type="xs:double"></xs:element>
</xs:sequence>
<xs:attribute name="id" type="xs:positiveInteger" use="required"></xs:attribute>
</xs:complexType>
</xs:element>
</xs:sequence>
</xs:complexType>
</xs:element>
</xs:schema>
- book.xml
<?xml version="1.0" encoding="UTF-8" ?>
<books xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="book.xsd" >
<book id="1001">
<name>java开发实战</name>
<author>张小三</author>
<price>98.5</price>
</book>
</books>
- 类BookDefaultHandler继承DefaultHandler
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
public class BookDefaultHandler extends DefaultHandler {
// 解析xml文档开始时调用
@Override
public void startDocument() throws SAXException {
super.startDocument();
System.out.println("解析xml文档开始");
}
// 解析xml文档结束时调用
@Override
public void endDocument() throws SAXException {
super.endDocument();
System.out.println("解析xml文档结束");
}
// 解析xml文档中节点时调用
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
super.startElement(uri, localName, qName, attributes);
// System.out.println("解析xml文档中节点开始时调用");
// 如果是book节点,获取节点的属性和属性值
if ("book".equals(qName)) {
// 获取所有的属性
int count = attributes.getLength(); // 属性的个数
// 循环获取每个属性
for (int i = 0; i < count; i++) {
String attName = attributes.getQName(i); // 属性名称
String attValue = attributes.getValue(i); // 属性值
System.out.println("属性名称:" + attName + "\t" + "属性值:" + attValue);
}
} else if (!"books".equals(qName) && !"book".equals(qName)) {
System.out.print("节点名称:" + qName + "\t");
}
}
// 解析xml文档中的节点结束调用
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
super.endElement(uri, localName, qName);
// System.out.println("解析xml文档中节点结束时调用");
}
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
super.characters(ch, start, length);
// 获取节点的文本值
String value = new String(ch,start,length);
if(!"".equals(value.trim())) {
System.out.println(value);
}
}
}
- 类TestSAXParse
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import java.io.IOException;
public class TestSAXParse {
public static void main(String[] args) throws ParserConfigurationException, SAXException, IOException {
// 创建SAXParserFactory的对象
SAXParserFactory spf = SAXParserFactory.newInstance();
// 创建SAXParser对象(解析器)
SAXParser parser = spf.newSAXParser();
// 创建一个DefaultHandler的子类
BookDefaultHandler bdh = new BookDefaultHandler();
// 调用parse方法
parser.parse("src/com/jsoup/xml/book.xml",bdh);
}
}
3.6 JDOM 解析XML数据
3.6.1 JDOM 解析XML的步骤
- book.xsd
<?xml version="1.0" encoding="UTF-8" ?>
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
<xs:element name="books">
<xs:complexType>
<xs:sequence>
<xs:element name="book" maxOccurs="unbounded">
<xs:complexType>
<xs:sequence>
<xs:element name="name" type="xs:string"></xs:element>
<xs:element name="author" type="xs:string"></xs:element>
<xs:element name="price" type="xs:double"></xs:element>
</xs:sequence>
<xs:attribute name="id" type="xs:positiveInteger" use="required"></xs:attribute>
</xs:complexType>
</xs:element>
</xs:sequence>
</xs:complexType>
</xs:element>
</xs:schema>
- book.xml
<?xml version="1.0" encoding="UTF-8" ?>
<books xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="book.xsd" >
<book id="1001">
<name>java开发实战</name>
<author>张小三</author>
<price>98.5</price>
</book>
</books>
- 类TestJDOM
import org.jdom2.Attribute;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.JDOMException;
import org.jdom2.input.SAXBuilder;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.List;
public class TestJDOM {
public static void main(String[] args) throws IOException, JDOMException {
// 创建一个SAXBuilder对象
SAXBuilder sb = new SAXBuilder();
// 调用build方法,得到Document对象
Document doc = sb.build(new FileInputStream("src/com/jsoup/xml/book.xml"));
// 获取根节点
Element root = doc.getRootElement(); // books元素
// 获取根节点的直接子节点的集合
List<Element> bookEle = root.getChildren();
// 遍历集合 得到book的每一个子节点(子元素)
for (int i = 0; i < bookEle.size(); i++) {
Element book = bookEle.get(i);
// 得到属性的集合
List<Attribute> attList = book.getAttributes();
for (Attribute attribute : attList) {
System.out.println("属性的名称:" + attribute.getName() + "\t" + "属性值:" + attribute.getValue());
}
}
// 得到每一个子节点
System.out.println("\n------------------------");
for (int i = 0; i < bookEle.size(); i++) {
Element book = bookEle.get(i); // 得到每一个book节点
List<Element> subBook = book.getChildren();
// 遍历每一个节点,获取节点名称节点值
for (Element ele : subBook) {
System.out.println(ele.getName() + "\t" + ele.getValue());
}
}
}
}
3.7 DOM4J 方式解析XML数据
3.7.1 DOM4J解析XML的步骤
- book.xsd
<?xml version="1.0" encoding="UTF-8" ?>
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
<xs:element name="books">
<xs:complexType>
<xs:sequence>
<xs:element name="book" maxOccurs="unbounded">
<xs:complexType>
<xs:sequence>
<xs:element name="name" type="xs:string"></xs:element>
<xs:element name="author" type="xs:string"></xs:element>
<xs:element name="price" type="xs:double"></xs:element>
</xs:sequence>
<xs:attribute name="id" type="xs:positiveInteger" use="required"></xs:attribute>
</xs:complexType>
</xs:element>
</xs:sequence>
</xs:complexType>
</xs:element>
</xs:schema>
- book.xml
<?xml version="1.0" encoding="UTF-8" ?>
<books xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="book.xsd" >
<book id="1001">
<name>java开发实战</name>
<author>张小三</author>
<price>98.5</price>
</book>
</books>
- 类Book
ublic class Book {
// 私有属性
private String name;
private String author;
private double price;
public Book() {
}
public Book(String name, String author, double price) {
this.name = name;
this.author = author;
this.price = price;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getAuthor() {
return author;
}
public void setAuthor(String author) {
this.author = author;
}
public double getPrice() {
return price;
}
public void setPrice(double price) {
this.price = price;
}
}
- 类TestDOM4J
import org.dom4j.Attribute;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import java.io.File;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
public class TestDOM4J {
public static void main(String[] args) throws DocumentException {
// 创建SAXReader对象
SAXReader reader = new SAXReader();
// 调用read方法
Document doc = reader.read(new File("src/com/jsoup/xml/book.xml"));
// 获取根元素
Element root = doc.getRootElement();
// 通过迭代器遍历直接节点
for (Iterator<Element> iteBook = root.elementIterator();iteBook.hasNext();) {
Element bookEle = iteBook.next();
// System.out.println(bookEle.getName());
for (Iterator<Attribute> iteAtt = bookEle.attributeIterator(); iteAtt.hasNext();) {
Attribute att = iteAtt.next();
System.out.println(att.getName() + "\t" + att.getValue());
}
}
System.out.println("\n------------------------");
List<Book> bookList = new ArrayList<>(); // 创建Book的集合
for (Iterator<Element> iteBook = root.elementIterator();iteBook.hasNext();) {
// 创建Book对象
Book book = new Book(); // 因为每循环一次,
Element bookEle = iteBook.next(); // 得到每一个book
for (Iterator<Element> subbookEle = bookEle.elementIterator(); subbookEle.hasNext(); ) {
// 得到每一个子元素
Element subEle = subbookEle.next();
System.out.println(subEle.getName() + "\t" + subEle.getText());
// 封装成Book对象
// 获取节点的对象
String nodeName = subEle.getName(); // name,author,price
// 使用switch判断
switch (nodeName) {
case "name":
book.setName(subEle.getText());
break;
case "author":
book.setAuthor(subEle.getText());
break;
case "price":
book.setPrice(Double.parseDouble(subEle.getText()));
break;
}
}
// book添加到集合中
bookList.add(book);
}
// 遍历集合
System.out.println("\n-------遍历集合--------");
for (Book book : bookList) {
System.out.println(book.getName() + "\t" + book.getAuthor() + "\t" + book.getPrice());
}
}
}
3.8 XPATH技术__快速获取节点
3.8.1 准备资源
- book.xsd
<?xml version="1.0" encoding="UTF-8" ?>
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
<xs:element name="books">
<xs:complexType>
<xs:sequence>
<xs:element name="book" maxOccurs="unbounded">
<xs:complexType>
<xs:sequence>
<xs:element name="name" type="xs:string"></xs:element>
<xs:element name="author" type="xs:string"></xs:element>
<xs:element name="price" type="xs:double"></xs:element>
</xs:sequence>
<xs:attribute name="id" type="xs:positiveInteger" use="required"></xs:attribute>
</xs:complexType>
</xs:element>
</xs:sequence>
</xs:complexType>
</xs:element>
</xs:schema>
- book.xml
<?xml version="1.0" encoding="UTF-8" ?>
<books xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="book.xsd" >
<book id="1001">
<name>java开发实战</name>
<author>张小三</author>
<price>98.5</price>
</book>
</books>
- 类TestXPATH
import org.dom4j.Attribute;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import java.util.List;
public class TestXPATH {
public static void main(String[] args) throws DocumentException {
// SAXReader对象
SAXReader reader = new SAXReader();
// 读取XML文件
Document doc = reader.read("src/com/jsoup/xml/book.xml");
// 得到第一个author节点
Node node = doc.selectSingleNode("//author");
System.out.println("节点名称:" + node.getName() + "\t" + node.getText());
System.out.println("\n-----------------------");
// 获取所有的author
List<Node> list = doc.selectNodes("//author");
for (Node n : list) {
System.out.println("节点名称:" + n.getName() + "\t" + n.getText());
}
// 选择有id属性的book元素
List<Attribute> attList = doc.selectNodes("//book/@id");
for (Attribute attribute : attList) {
System.out.println("属性名称:" + attribute.getName() + "\t" + attribute.getValue());
}
}
}
4、案例:使用Jsoup完成网页爬虫
4.1 网页爬虫步骤
4.2 使用XML配置爬虫程序的参数
- Crawler.xml配置文件
<?xml version="1.0" encoding="UTF-8" ?>
<Crawler>
<min>9736774</min>
<max>9736669</max>
</Crawler>
- 爬虫java代码
package com.crawler;
import cn.wanghaomiao.xpath.model.JXDocument;
import cn.wanghaomiao.xpath.model.JXNode;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.IOException;
import java.net.JarURLConnection;
import java.net.URL;
import java.util.List;
public class Demo {
public static void main(String[] args) {
int min = 0;
int max = 0;
// 解析XML配置文件,加载XML文件到内存,转化为Document对象
try {
Document doc = Jsoup.parse(new File(Demo.class.getClassLoader().getResource("com/crawler/Crawler.xml").getPath()), "utf-8");
// 获取XML配置信息
JXDocument jxDocument = new JXDocument(doc);
List<JXNode> minNodes = jxDocument.selN("/min");
List<JXNode> maxNodes = jxDocument.selN("/max");
// 转成Element对象
Element minEle = minNodes.get(0).getElement();
Element maxEle = minNodes.get(0).getElement();
String maxStr = maxEle.text();
String minStr = minEle.text();
min = Integer.parseInt(minStr);
max = Integer.parseInt(maxStr);
} catch (IOException e) {
e.printStackTrace();
}
// 循环爬虫
for (int i = min; i <= max ; i++) {
// 使用Jsoup获取网页HTML源文件,转为Document对象
try {
Document document = Jsoup.parse(new URL("http://daily.zhihu.com/story/" + i),1000);
System.out.println(document);
// 获取头图、标题、作者、正文
Elements elements = document.getElementsByAttributeValue("alt", "头图");
// System.out.println(elements);
Elements headerEle = document.select(".DailyHeader-title");
Elements authorEle = document.select(".author");
Elements contentEle = document.select(".content");
// 头图
String headerImg = elements.get(0).attr("src");
// 标题
String title = headerEle.text();
// 作者
String author = authorEle.text();
// 正文
String content = contentEle.text();
System.out.println("=======================");
System.out.println(content);
} catch (IOException e) {
}
}
}
}