xpath和htmlparser的配合使用

最新推荐文章于 2024-05-07 11:26:02 发布

uestcyao

最新推荐文章于 2024-05-07 11:26:02 发布

阅读量4.5k

点赞数

CC 4.0 BY-SA版权

分类专栏： py web抓取分析文章标签： html xml

本文链接：https://blog.youkuaiyun.com/uestcyao/article/details/7881258

py web抓取分析专栏收录该内容

41 篇文章

订阅专栏

本文探讨了如何在Java环境中结合xpath和htmlparser处理非标准HTML文件，尤其是解决落单标签的问题。文章指出，xpath适用于标准XML，但无法很好地处理HTML的不规则性，而htmlparser能较好地解析不完整标签。然而，示例代码全为Java，缺少Python的实现。

xpath只能够处理标准的xml文件，即每个开始标签必须对应一个结束标签的情况。而htmlparser只需要处理标签即可。那么问题是怎么样把一个html文件的落单的标签处理掉呢？全是java的代码，就没有一个python的示范代码么？

/// <summary>
    /// Helper that converts selected tags of an HTML document into an XML string.
    /// </summary>
    public class XMLHelper
    {
        /// <summary>
        /// Regular expression a tag or attribute name must match before it is copied to the XML output.
        /// It rejects names containing any of: / $ ; " ! # ) .
        /// </summary>
        private static readonly string validName = @"^[^/$//;""/!#/)/.]+$";

        #region CovertHtmlToXml
        /// <summary>
        /// Converts HTML source into XML. Every tag named <paramref name="targetTag"/> found by the
        /// HTML parser becomes a child of a single root element called "Tags"; its attributes,
        /// nested tags and text nodes are copied recursively.
        /// </summary>
        /// <param name="html">HTML source text.</param>
        /// <param name="targetTag">Name of the tags to extract and convert.</param>
        /// <returns>The outer XML of the generated document, including the XML declaration.</returns>
        /// <exception cref="Exception">
        /// Thrown when parsing or conversion fails. The original failure is available
        /// through <see cref="Exception.InnerException"/>.
        /// </exception>
        public static string CovertHtmlToXml(string html, string targetTag)
        {
            try
            {
                XmlDocument doc = new XmlDocument();
                doc.AppendChild(doc.CreateXmlDeclaration("1.0", "utf-8", null));

                // Let the HTML parser cope with the non-well-formed markup and
                // hand back only the tags with the requested name.
                Parser parser = Parser.CreateParser(html, "GBK");
                NodeList nodes = parser.Parse(new TagNameFilter(targetTag));

                // Single root element that holds every converted tag.
                XmlElement root = doc.CreateElement("Tags");

                for (int i = 0; i < nodes.Size(); i++)
                {
                    TagNode tagNode = nodes[i] as TagNode;

                    // Skip anything that is not a tag, or whose name cannot be used as an
                    // XML element name (same rule that is applied to nested tags).
                    if (tagNode == null || !IsValidName(tagNode.TagName))
                    {
                        continue;
                    }

                    XmlElement element = doc.CreateElement(tagNode.TagName);
                    CopyAttributes(tagNode, element, doc);
                    AppendChild(tagNode, element, doc);
                    root.AppendChild(element);
                }

                doc.AppendChild(root);
                return doc.OuterXml;
            }
            catch (Exception ex)
            {
                // Keep the original exception as InnerException so its type and stack trace are not lost.
                throw new Exception("转换html内容出错:" + ex.Message, ex);
            }
        }

        /// <summary>
        /// Tells whether a tag or attribute name may be copied into the XML document.
        /// </summary>
        /// <param name="name">Candidate name.</param>
        /// <returns>
        /// True when the name is non-empty, matches <see cref="validName"/> and is a valid XML name.
        /// </returns>
        private static bool IsValidName(string name)
        {
            // The regex alone lets through names (for example ones containing spaces)
            // that are not valid XML names, so the XML name rule is checked as well.
            return !string.IsNullOrEmpty(name)
                && Regex.IsMatch(name, validName)
                && XmlReader.IsName(name);
        }

        /// <summary>
        /// Copies the attributes of an HTML tag onto an XML element, skipping invalid names.
        /// </summary>
        /// <param name="source">HTML tag whose attributes are read.</param>
        /// <param name="target">XML element that receives the attributes.</param>
        /// <param name="doc">XML document used to create the attribute nodes.</param>
        private static void CopyAttributes(TagNode source, XmlElement target, XmlDocument doc)
        {
            Hashtable attributes = source.Attributes;
            if (attributes == null)
            {
                return;
            }

            foreach (DictionaryEntry entry in attributes)
            {
                string name = entry.Key.ToString();
                if (!IsValidName(name))
                {
                    continue;
                }

                XmlAttribute attr = doc.CreateAttribute(name);
                // An entry without a value is written as an empty string instead of failing.
                attr.Value = entry.Value == null ? string.Empty : entry.Value.ToString();
                target.Attributes.Append(attr);
            }
        }

        /// <summary>
        /// Recursively copies the children of an HTML node under an XML node.
        /// Tag children become elements (with their attributes), text children become text nodes;
        /// any other kind of child, and any tag with an invalid name, is skipped together with its subtree.
        /// </summary>
        /// <param name="tagNode">HTML parent node.</param>
        /// <param name="parent">XML parent node.</param>
        /// <param name="doc">XML document used to create new nodes.</param>
        private static void AppendChild(INode tagNode, XmlNode parent, XmlDocument doc)
        {
            // Nothing to do for leaf nodes.
            if (tagNode.Children == null || tagNode.Children.Size() == 0)
            {
                return;
            }

            for (int i = 0; i < tagNode.Children.Size(); i++)
            {
                INode node = tagNode.Children[i];
                XmlNode xmlNode = null;

                // HTML tag: create an element carrying the tag's attributes.
                TagNode tn = node as TagNode;
                if (tn != null && IsValidName(tn.TagName))
                {
                    XmlElement element = doc.CreateElement(tn.TagName);
                    CopyAttributes(tn, element, doc);
                    xmlNode = element;
                }

                // Text: create a text node from the plain text of the HTML node.
                TextNode text = node as TextNode;
                if (text != null)
                {
                    xmlNode = doc.CreateTextNode(text.ToPlainTextString());
                }

                if (xmlNode != null)
                {
                    parent.AppendChild(xmlNode);
                    AppendChild(node, xmlNode, doc);
                }
            }
        }
        #endregion
    }

最近做一个项目wml。wml不支持script。所以在写xslt时候，已经做了处理。

     但是用JTidy将html转xml的时候，发现JTidy也有失误的时候……就是对复杂的script不能很好地按照原来的结构转成xml。我被迫只好在使用JTidy时，先过滤script。先后使用DOM、JDOM都不是很理想，原因是html的容错性强，而xml比较严格，所以DOM、JDOM无法很好地解析html。最后看到HtmlParser，看了它的结构和性能，我觉得就它符合我的需求。

Go。贴我写的代码。前面拿到InputStream我就省略了。

// Take the body entity from the HTTP response (httpResponse is obtained earlier; that part is omitted here).
HttpEntity entity = httpResponse.getEntity();

// Stream over the body content; it is turned into text by getHtmlString below.
InputStream is = entity.getContent();

/**
 * Reads the whole stream as UTF-8 text and returns it as a single string.
 * Each line that is read is followed by "\n", so line endings are normalized to "\n"
 * and a non-empty result always ends with a newline.
 * The stream is closed before this method returns, whether reading succeeds or fails.
 *
 * @param is stream to read; it is closed by this method
 * @return the decoded text, or an empty string when the stream has no content
 * @throws Exception if reading or decoding the stream fails
 */
public static String getHtmlString(InputStream is) throws Exception{
   // StringBuilder is enough here: the buffer never leaves this method.
   StringBuilder sb = new StringBuilder();
   BufferedReader in = new BufferedReader(new InputStreamReader(is, "UTF-8"));
   try {
    String inputLine;
    while ((inputLine = in.readLine()) != null) {
     sb.append(inputLine).append("\n");
    }
   } finally {
    // Closing the reader also closes the wrapped stream, so it is not leaked on failure.
    in.close();
   }
   return sb.toString();
}
/**
 * Parses the given HTML, filters its node tree with visitNodeList and returns the
 * remaining nodes rendered back to HTML.
 * If the parser reports a ParserException the input is returned unchanged.
 *
 * @param strBody HTML text to filter
 * @return the filtered HTML, or strBody itself when parsing fails
 */
public static String getFilterBody(String strBody) {
   Parser htmlParser = Parser.createParser(strBody, "utf-8");
   try {
    // A null filter makes the parser return every node.
    NodeList nodes = htmlParser.parse(null);
    visitNodeList(nodes);
    return nodes.toHtml();
   } catch (ParserException ignored) {
    // Best effort: fall back to the unfiltered text when it cannot be parsed.
    return strBody;
   }
}
// Recursively removes every ScriptTag from the list and from the children of the nodes that stay.
private static void visitNodeList(NodeList list) {
   // Walk backwards: removing index i then only moves elements that were already visited,
   // so no node is skipped (a forward loop would miss the node right after a removed one).
   for (int i = list.size() - 1; i >= 0; i--) {
    Node node = list.elementAt(i);

    if (node instanceof Tag) {
     if (node instanceof ScriptTag) {
      list.remove(i);
      continue;
     }
     // Further tag types to strip can be checked here.
    }

    NodeList children = node.getChildren();
    if (children != null && children.size() > 0) {
     visitNodeList(children);
    }
   }
}

最后

// Read the response as text and filter it with getFilterBody.
String newHtmlString = getFilterBody(getHtmlString(is));
// Encode explicitly as UTF-8, the charset getHtmlString decoded with;
// getBytes() without an argument would use the platform default charset instead.
InputStream newIs = new ByteArrayInputStream(newHtmlString.getBytes("UTF-8"));

可以返回InputStream流了