C# 爬虫学习之猫眼电影(完整代码见最后)
1、HTTP部分
1.1 引用
using System;
using System.Net;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Configuration;
using System.Diagnostics;
using System.IO;
1.2 Get方法
/// <summary>
/// Minimal synchronous HTTP helper used by the crawler.
/// </summary>
public class HTTP
{
    /// <summary>
    /// Sends a GET request and returns the response body decoded as UTF-8.
    /// </summary>
    /// <param name="url">URL to request.</param>
    /// <param name="cookies">
    /// Raw value for the <c>Cookie</c> request header (for example <c>"ci=1"</c>).
    /// No Cookie header is sent when this is null or empty.
    /// </param>
    /// <param name="timeout">Value assigned to <see cref="HttpWebRequest.Timeout"/>, in milliseconds.</param>
    /// <returns>The complete response body as a string.</returns>
    /// <exception cref="WebException">Propagated from <see cref="HttpWebRequest.GetResponse"/> when the request fails.</exception>
    public static string GET(string url, string cookies = null, int timeout = 5000)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Method = "GET";
        request.ContentType = "text/html;charset=UTF-8";
        request.UserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36";
        request.Timeout = timeout;

        if (!string.IsNullOrEmpty(cookies))
        {
            request.Headers.Add("Cookie", cookies);
        }

        // The using statements dispose the response, its stream and the reader
        // even when reading the body throws, so the connection is not leaked.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream responseStream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(responseStream, Encoding.GetEncoding("utf-8")))
        {
            return reader.ReadToEnd();
        }
    }
}
2、数据抓取部分
2.1 步骤
- 寻找如何提交所属城市信息
页面cookies中有个ci=1属性,值为城市ID,ci=1即为北京。把ci属性通过cookies提交上去即可设置城市
- 发送Get请求
// Download the first listing page with the city cookie, then strip all
// line breaks, spaces and tabs so the regular expressions can match one compact string.
string rawPage = HTTP.GET("http://maoyan.com/cinemas?offset=0", this.ci);
string result = rawPage
    .Replace("\n", "")
    .Replace(" ", "")
    .Replace("\t", "")
    .Replace("\r", "");
- 获取到页面HTML数据后,通过正则取出所需的数据。C#使用正则需要添加引用
using System.Text.RegularExpressions;
- 取城市名
// Extract the city name only when it has not been captured yet.
// The pattern has no spaces because whitespace was removed from the HTML beforehand.
if (this.cityName == null)
{
    this.cityName = Regex
        .Match(this.html, "<divclass=\"city-name\">([\\S]+?)<spanclass=\"caret\">")
        .Groups[1]
        .Value;
}
- 取出影院名和地址
// Extract cinema entries: capture group 3 is the name, group 5 is the address.
MatchCollection cinemas = Regex.Matches(this.html,"<divclass=\"cinema-info\"><ahref=\"([\\S]+?)\"class.*?(}\\\">([\\S]+?)</a>).*?(地址:([\\S]+?))</p></div>.*?</div>");
foreach (Match cinema in cinemas)
{
    // Running counter across every page processed so far.
    this.cinemaCount++;
    Console.WriteLine("------第{0}家影院------", this.cinemaCount);
    Console.WriteLine("名字:{0}", cinema.Groups[3].Value);
    Console.WriteLine("地址:{0}", cinema.Groups[5].Value);
}
- 取页面数
// Read the page count from the pagination links, then fetch and print the remaining pages.
MatchCollection pageLinks = Regex.Matches(this.html,"offset=\\d+\">(\\d+)</a>");
if (pageLinks.Count > 1)
{
    // The last pagination link carries the highest page number in its last capture group.
    Match lastLink = pageLinks[pageLinks.Count - 1];
    Group lastGroup = lastLink.Groups[lastLink.Groups.Count - 1];
    this.pageCount = Int32.Parse(lastGroup.Value);

    // The first page (offset 0) is already loaded, so start from the second one.
    for (int page = 1; page < this.pageCount; page++)
    {
        string pageUrl = String.Format("http://maoyan.com/cinemas?offset={0}", page * 12);
        string pageData = HTTP.GET(pageUrl, this.ci)
            .Replace("\n", "")
            .Replace(" ", "")
            .Replace("\t", "")
            .Replace("\r", "");
        this.html = pageData;
        this.getCinemaData();
    }
}
2.2 结语
至此就可以把猫眼电影单个城市所有电影院名和地址全部取出了
完整代码
using System;
using System.Text.RegularExpressions;
namespace maoyan
{
/// <summary>
/// Crawls the Maoyan cinema listing for one city and prints every cinema's
/// name and address to the console.
/// </summary>
public class Cinema
{
    // Offset step between consecutive listing pages
    // (presumably the number of cinemas shown per page; verify against the site).
    private const int PageOffsetStep = 12;

    // Listing URL; {0} is the offset query parameter.
    private const string ListUrlFormat = "http://maoyan.com/cinemas?offset={0}";

    // The patterns contain no spaces because every page is stripped of
    // whitespace before it is stored in the html field.
    private const string CityNamePattern = "<divclass=\"city-name\">([\\S]+?)<spanclass=\"caret\">";
    private const string CinemaPattern = "<divclass=\"cinema-info\"><ahref=\"([\\S]+?)\"class.*?(}\\\">([\\S]+?)</a>).*?(地址:([\\S]+?))</p></div>.*?</div>";
    private const string PageLinkPattern = "offset=\\d+\">(\\d+)</a>";

    private bool isReady = false;
    private bool isStart = false;

    /// <summary>City name extracted from the page; null until <see cref="getCityName"/> has run.</summary>
    public string cityName = null;

    // Whitespace-stripped HTML of the page currently being parsed.
    private string html;

    /// <summary>Number of cinemas printed since the last call to <see cref="Ready"/>.</summary>
    public int cinemaCount = 0;

    /// <summary>Page count read from the pagination links; 0 until <see cref="getPageCount"/> finds more than one link.</summary>
    public int pageCount = 0;

    /// <summary>Cookie string sent with every request (for example "ci=1").</summary>
    public string ci;

    /// <summary>
    /// Resets the crawl state and downloads the first listing page for the given city.
    /// Does nothing when a previous <see cref="Ready"/> has not yet been consumed by <see cref="Start"/>.
    /// </summary>
    /// <param name="ci">Cookie string that selects the city, sent as the Cookie header.</param>
    public void Ready (string ci)
    {
        if (this.isReady)
        {
            return;
        }

        this.html = "";
        this.cinemaCount = 0;
        this.cityName = null;
        this.pageCount = 0;

        // The cookie must be stored BEFORE the first request. Previously the request
        // was issued with the old (initially null) value of this.ci, so the city
        // selection was ignored on the first page.
        this.ci = ci;

        // Start from the first page of the current city.
        this.html = this.FetchPage (0);
        this.isReady = true;
    }

    /// <summary>
    /// Prints the city name and the cinemas found on the page loaded by <see cref="Ready"/>.
    /// Does nothing when no page has been loaded or a run is already in progress.
    /// </summary>
    public void Start ()
    {
        // html is null when Ready() was never called; treat that like an empty page.
        if (string.IsNullOrEmpty (this.html) || this.isStart)
        {
            return;
        }

        this.isStart = true;
        try
        {
            // Extract the city name.
            this.getCityName ();
            Console.WriteLine ("当前城市:{0}", this.cityName);

            // Extract the cinema data of the loaded page.
            this.getCinemaData ();

            // getPageCount() is not invoked here (the call was commented out in the
            // original listing), so only the first page is printed by Start().
        }
        finally
        {
            // Always release the flags so the instance can be reused after a failure.
            this.isReady = false;
            this.isStart = false;
        }
    }

    /// <summary>
    /// Extracts the city name from the current page unless it is already known.
    /// </summary>
    public void getCityName ()
    {
        if (this.cityName == null)
        {
            Match cityMatch = Regex.Match (this.html, CityNamePattern);
            this.cityName = cityMatch.Groups [1].Value;
        }
    }

    /// <summary>
    /// Prints the name and address of every cinema found on the current page
    /// and increments <see cref="cinemaCount"/> for each of them.
    /// </summary>
    public void getCinemaData ()
    {
        // Capture group 3 is the cinema name, group 5 is the address.
        MatchCollection cinemas = Regex.Matches (this.html, CinemaPattern);
        foreach (Match cinema in cinemas)
        {
            this.cinemaCount++;
            Console.WriteLine (String.Format ("------第{0}家影院------", this.cinemaCount));
            Console.WriteLine (String.Format ("名字:{0}", cinema.Groups [3].Value));
            Console.WriteLine (String.Format ("地址:{0}", cinema.Groups [5].Value));
        }
    }

    /// <summary>
    /// Reads the page count from the pagination links of the current page and,
    /// when there is more than one link, downloads and prints every following page.
    /// </summary>
    public void getPageCount ()
    {
        MatchCollection pageLinks = Regex.Matches (this.html, PageLinkPattern);
        if (pageLinks.Count <= 1)
        {
            return;
        }

        // The last pagination link carries the highest page number; the pattern has a
        // single capture group, so it is group 1.
        this.pageCount = Int32.Parse (pageLinks [pageLinks.Count - 1].Groups [1].Value);

        // The page at offset 0 is already loaded, so continue with the second page.
        for (int page = 1; page <= this.pageCount - 1; page++)
        {
            this.html = this.FetchPage (page * PageOffsetStep);
            this.getCinemaData ();
        }
    }

    // Downloads the listing page at the given offset with the city cookie and removes
    // all line breaks, spaces and tabs so the space-free patterns can match.
    private string FetchPage (int offset)
    {
        string pageUrl = String.Format (ListUrlFormat, offset);
        string pageData = HTTP.GET (pageUrl, this.ci);
        return pageData.Replace ("\n", "").Replace (" ", "").Replace ("\t", "").Replace ("\r", "");
    }
}
}
Main函数
using System;
namespace maoyan
{
/// <summary>
/// Console entry point: runs one crawl for a fixed city cookie.
/// </summary>
class MainClass
{
    /// <summary>
    /// Prints a start banner, prepares and runs a <c>Cinema</c> crawl, then prints an end banner.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    public static void Main (string[] args)
    {
        // Cookie value selecting the city; the accompanying article states ci=1 is Beijing.
        const string cityCookie = "ci=1";

        Console.WriteLine ("------------开始------------");

        Cinema crawler = new Cinema ();
        crawler.Ready (cityCookie);
        crawler.Start ();

        Console.WriteLine ("------------结束------------");
    }
}
}
1万+

被折叠的 条评论
为什么被折叠?



