HtmlParser 获取 HTML:基于 HtmlParser 编写的提取页面纯文本的 C# 程序

这是一个C#程序,用于从网页中提取文本内容并处理可能出现的乱码问题。程序使用Winista.Text.HtmlParser库解析HTML,通过检查charset来确定正确的编码,并去除JavaScript和样式。最终,它将提取的文本进行修剪和展示。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

C# 网页内容提取程序,在 VS2010 下调试通过,且无乱码现象。

using System;

using System.Collections.Generic;

using System.ComponentModel;

using System.Data;

using System.Drawing;

using System.Linq;

using System.Text;

using System.Windows.Forms;

using System.Threading;

using System.IO;

using System.Net;

using Winista.Text.HtmlParser;

using Winista.Text.HtmlParser.Lex;

using Winista.Text.HtmlParser.Nodes;

using Winista.Text.HtmlParser.Util;

using Winista.Text.HtmlParser.Visitors;

using Winista.Text.HtmlParser.Filters;

using Winista.Text.HtmlParser.Tags;

using Winista.Text.HtmlParser.Http;

using System.Diagnostics;

using System.Text.RegularExpressions;

namespace testhtml

{

public partial class Form1 : Form

{

/// <summary>
/// Creates the form. All setup is delegated to <c>InitializeComponent</c>,
/// which is not defined in this part of the partial class.
/// </summary>
public Form1() { InitializeComponent(); }

/// <summary>
/// Click handler that runs the extraction pipeline for the URL in
/// <c>textBox1</c> and shows the outcome in <c>textBox2</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    // Stage 1 + 2: download the page, then pass it through delJsStyle.
    htmlText = delJsStyle(getData(textBox1.Text));

    // Stage 3 + 4: extract the text via toText, then pass it through delspace.
    htmlText = delspace(toText(htmlText));

    textBox2.Text = htmlText;
}

// Most recent text produced by this form. getData stores its result here and
// button1_Click overwrites it with each later pipeline stage.
string htmlText = "";

// Matches a charset declaration such as charset=gb2312 or charset="utf-8";
// group 1 is the encoding name.
private static readonly Regex charsetDeclaration =
    new Regex(@"charset\s*=\s*[""']?\s*([A-Za-z0-9_\-]+)", RegexOptions.IgnoreCase);

/// <summary>
/// Downloads the page at <paramref name="WebUrl"/> once and decodes it with the
/// charset the page declares (falling back to the Content-Type header, then UTF-8).
/// </summary>
/// <param name="WebUrl">Address of the page to fetch.</param>
/// <returns>
/// The decoded page text, or the fixed failure message when the download
/// failed or the page was blank. The same value is stored in <c>htmlText</c>.
/// </returns>
/// <remarks>
/// Any exception raised while downloading or decoding is reported in a
/// message box and is not rethrown.
/// </remarks>
private string getData(string WebUrl)
{
    // Work on a local so a failed request can never return text left over
    // from an earlier call.
    string page = "";

    try
    {
        using (WebClient client = new WebClient())
        {
            // Fetch raw bytes so the charset can be chosen before decoding;
            // this avoids downloading the page a second time.
            byte[] raw = client.DownloadData(WebUrl);

            string contentType = client.ResponseHeaders == null
                ? null
                : client.ResponseHeaders["Content-Type"];

            Encoding encoding = detectEncoding(raw, contentType);

            // detectEncodingFromByteOrderMarks = true lets a BOM override the
            // guessed encoding and keeps the BOM out of the returned text.
            using (StreamReader reader = new StreamReader(new MemoryStream(raw), encoding, true))
            {
                page = reader.ReadToEnd();
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: report the problem and fall through to the failure text.
        MessageBox.Show(ex.Message + "ee");
    }

    if (string.IsNullOrWhiteSpace(page))
    {
        page = "获取页面失败!";
    }

    htmlText = page;
    return page;
}

// Picks the encoding for a downloaded page: first charset declaration in the
// markup, else the one in the Content-Type header, else UTF-8.
private static Encoding detectEncoding(byte[] raw, string contentType)
{
    // Charset names are plain ASCII, so an ASCII view of the bytes is enough
    // to locate the declaration whatever the real encoding is.
    string charsetName = findCharset(Encoding.ASCII.GetString(raw)) ?? findCharset(contentType);
    if (charsetName == null)
    {
        return Encoding.UTF8;
    }

    try
    {
        return Encoding.GetEncoding(charsetName);
    }
    catch (ArgumentException)
    {
        // Unknown or unsupported charset name: use the same default as when
        // nothing is declared.
        return Encoding.UTF8;
    }
}

// Returns the encoding name from the first charset declaration in text, or
// null when text is null/empty or declares none.
private static string findCharset(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return null;
    }

    Match declaration = charsetDeclaration.Match(text);
    return declaration.Success ? declaration.Groups[1].Value : null;
}

/// <summary>
/// Parses <paramref name="str"/> as HTML, selects the nodes accepted by a
/// <c>TagNameFilter("BODY")</c>, and returns the text a
/// <c>TextExtractingVisitor</c> collects from them.
/// </summary>
/// <param name="str">HTML markup to parse.</param>
/// <returns>The visitor's extracted text.</returns>
private string toText(string str)
{
    Parser htmlParser = new Parser(new Lexer(str));
    NodeList bodyNodes = htmlParser.Parse(new TagNameFilter("BODY"));

    TextExtractingVisitor textCollector = new TextExtractingVisitor();
    bodyNodes.VisitAllNodesWith(textCollector);

    return textCollector.ExtractedText.ToString();
}

/// <summary>
/// Pre-fills <c>textBox1</c> with a default address.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void Form1_Load_1(object sender, EventArgs e)
{
    const string defaultUrl = "http://www.ybzy.cn";
    textBox1.Text = defaultUrl;
}

/// <summary>
/// Handler for button2; it currently has no behavior.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button2_Click(object sender, EventArgs e) { }

// Matches a whole <script>...</script> or <style>...</style> element.
// [\s\S]*? spans line breaks lazily; \1 ties the closing tag to the opening one.
private static readonly Regex scriptOrStyleBlock = new Regex(
    @"<(script|style)\b[^>]*>[\s\S]*?</\1\s*>",
    RegexOptions.IgnoreCase);

/// <summary>
/// Removes every <c>script</c> and <c>style</c> element, including its
/// content, from the given markup. Tag names are matched case-insensitively.
/// </summary>
/// <param name="str">HTML markup; may be null or empty.</param>
/// <returns>
/// The markup without script/style elements, or an empty string when
/// <paramref name="str"/> is null or empty.
/// </returns>
public static string delJsStyle(string str)
{
    if (string.IsNullOrEmpty(str))
    {
        return string.Empty;
    }

    // The earlier patterns "(?m)]*>" had lost their tag parts and deleted
    // every '>' in the input instead of removing script/style blocks.
    return scriptOrStyleBlock.Replace(str, "");
}

/// <summary>
/// Collapses every run of whitespace (spaces, tabs, line breaks) in
/// <paramref name="str"/> into a single space.
/// </summary>
/// <param name="str">Text to normalize; may be null or empty.</param>
/// <returns>
/// The text with whitespace runs collapsed, or an empty string when
/// <paramref name="str"/> is null or empty.
/// </returns>
public static string delspace(string str)
{
    if (string.IsNullOrEmpty(str))
    {
        return string.Empty;
    }

    // The whitespace class is \s; the earlier pattern "/s+" matched a literal
    // slash followed by 's' characters and never touched whitespace.
    return Regex.Replace(str, @"\s+", " ");
}

}

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值