因为逻辑很简单,直接上代码:
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
public class Program
{
public static List<Html_a> html_As = new List<Html_a>();
/// <summary>
/// Performs a blocking HTTP GET and returns the response body decoded as GB2312.
/// Network and I/O failures are retried indefinitely with a 100 ms pause between
/// attempts, so this method only returns once a response body has been read.
/// </summary>
/// <param name="Url">Absolute address to request.</param>
/// <returns>The response body as text.</returns>
/// <remarks>
/// Only <see cref="WebException"/> and <see cref="IOException"/> are retried.
/// Other failures (for example a malformed <paramref name="Url"/>) propagate to
/// the caller instead of being retried forever.
/// </remarks>
public static string HttpGet(string Url)
{
    // Code-page encodings such as gb2312 must be registered before lookup on
    // runtimes that do not ship them by default; registering repeatedly is harmless.
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
    Encoding pageEncoding = Encoding.GetEncoding("gb2312");

    // Retry in a loop rather than by recursion so a long outage cannot
    // exhaust the call stack.
    while (true)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
            request.Method = "GET";
            request.ContentType = "text/html;charset=gb2312";

            // The using statements release the connection and streams even
            // when reading the body fails part-way through.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, pageEncoding))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex) when (ex is WebException || ex is IOException)
        {
            // Treated as transient: back off briefly, then try again.
            Thread.Sleep(100);
        }
    }
}
static void Main(string[] args)
{
    // Root of the 2019 division-code pages; GetMsg appends the page names to it.
    const string rootUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/";

    // Kick off the crawl from the root address.
    GetMsg(rootUrl);
}
public static void GetMsg(string Url)
{
var shenarr = GetShen(HttpGet(Url + "index.html"));
for (var shen_i=2;shen_i<shenarr.Count;shen_i++)
{
Html_a sen = shenarr[shen_i];
Wreiterl(sen);
if (sen.href == null)
{
continue;
}
var shiarr = GetShi(HttpGet(Url + sen.href));
for (var shi_i= 0; shi_i < shiarr.Count; shi_i++)
{
Html_a shi = shiarr[shi_i];
shi.sjcode = sen.code;
Wreiterl(shi);
if (shi.href == null)
{

本文介绍了一种使用C#进行网页爬取的方法,通过分析网页结构,利用正则表达式和HtmlAgilityPack库解析HTML,实现对中国国家统计局网站上的行政区划数据的抓取。从省级开始逐级抓取至街道级行政区划,包括省份、城市、区县、乡镇和街道的数据。
最低0.47元/天 解锁文章

被折叠的 条评论
为什么被折叠?



