1.架构:.NET Core 2.0 控制台应用程序 + DotNetSpider + NLog 日志
架构:
引用DotNetSpider2.Core 以及 DotNetSpider2.Extension
框架是:.NET Core 2.0 控制台应用程序
2.创建实体类 JuzimiListEntity,继承 SpiderEntity 类
using DotnetSpider.Extension.Model;
using System;
using System.Collections.Generic;
using System.Text;
namespace Ex003
{
/// <summary>
/// Entity holding a single sentence scraped from a Juzimi list page.
/// </summary>
public class JuzimiListEntity : SpiderEntity
{
    /// <summary>
    /// Text of the scraped sentence.
    /// </summary>
    public string xlistju { get; set; }

    /// <summary>
    /// Formats the entity as the fixed label followed by the sentence text.
    /// </summary>
    /// <returns>The label and the value of <see cref="xlistju"/>.</returns>
    public override string ToString() => $"句子迷:{ xlistju}";
}
}
3.创建用来处理页面数据的类,该类继承 BasePageProcessor 类
using DotnetSpider.Core;
using DotnetSpider.Core.Processor;
using System;
using System.Collections.Generic;
using System.Text;
namespace Ex003
{
/// <summary>
/// Page processor that extracts the sentence entries from a Juzimi list page.
/// </summary>
class JuzimiProcessor : BasePageProcessor
{
    /// <summary>
    /// Reads the inner text of the <c>a.xlistju</c> link inside every
    /// <c>div.views-field-phpcode</c> container and stores the collected
    /// entities on the page under the "JuzimiList" result key.
    /// </summary>
    /// <param name="page">The page whose content is parsed.</param>
    protected override void Handle(Page page)
    {
        var list = new List<JuzimiListEntity>();
        var modelHtmlList = page.Selectable.XPath(".//div[@class='views-field-phpcode']").Nodes();
        foreach (var modelHtml in modelHtmlList)
        {
            string rawValue = modelHtml.XPath(".//a[@class='xlistju']").GetValue(DotnetSpider.Core.Selector.ValueOption.InnerText);
            if (rawValue == null)
            {
                // NOTE(review): GetValue presumably returns null when the container has no
                // matching link; skip the node rather than dereference null and abort the page.
                continue;
            }

            // Strip leftover markup and layout whitespace so the sentence is a single line.
            string sentence = rawValue
                .Trim()
                .Replace("<br>", string.Empty)
                .Replace("\n", string.Empty)
                .Replace("\t", string.Empty);

            list.Add(new JuzimiListEntity { xlistju = sentence });
        }

        page.AddResultItem("JuzimiList", list);
    }
}
}
4.创建管道类,继承 BasePipeline 类
using DotnetSpider.Core;
using DotnetSpider.Core.Pipeline;
using NLog;
using System;
using System.Collections.Generic;
using System.Text;
namespace Ex003
{
/// <summary>
/// Pipeline that prints and logs the sentences stored under the "JuzimiList" result key.
/// </summary>
class JuzimiPipe : BasePipeline
{
    // Declared with `new`, as in the original, to hide the member of the same name on the base class.
    private static new readonly Logger Logger = LogManager.GetCurrentClassLogger();

    /// <summary>
    /// Writes the number of entities of each result item to the console, then
    /// writes every entity to the console and to the NLog logger.
    /// </summary>
    /// <param name="resultItems">Result items handed over by the spider.</param>
    /// <param name="spider">The spider that produced the results (not used here).</param>
    public override void Process(IEnumerable<ResultItems> resultItems, ISpider spider)
    {
        foreach (var resultItem in resultItems)
        {
            // Look the list up once; `as` yields null when the stored value has another type,
            // so guard before touching it instead of dereferencing the cast result directly.
            var entities = resultItem.Results["JuzimiList"] as List<JuzimiListEntity>;
            if (entities == null)
            {
                continue;
            }

            Console.WriteLine(entities.Count);
            foreach (var item in entities)
            {
                Console.WriteLine(item);
                Logger.Info("爬取的内容:" + item);
            }
        }
    }
}
}
5.执行爬虫
using DotnetSpider.Core;
using DotnetSpider.Core.Scheduler;
using System;
using System.Collections.Generic;
namespace Ex003
{
/// <summary>
/// Console entry point that runs the Juzimi crawler.
/// </summary>
class Program
{
    // Inclusive bounds of the "page" query parameter values that are queued.
    private const int FirstPage = 1;
    private const int LastPage = 29;

    /// <summary>
    /// Crawls the Juzimi "original/recommend" list pages.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        var site = new DotnetSpider.Core.Site() { EncodingName = "UTF-8" };

        // Queue all 29 list pages. The bound is inclusive: an exclusive `< 29`
        // would stop at page 28 and queue only 28 pages.
        for (int pageNumber = FirstPage; pageNumber <= LastPage; pageNumber++)
        {
            site.AddStartUrl($"http://www.juzimi.com/original/recommend?page={pageNumber}");
        }

        var spider = Spider.Create(site, new QueueDuplicateRemovedScheduler(), new JuzimiProcessor())
            .AddPipeline(new JuzimiPipe());

        // Single download thread, as in the original configuration.
        spider.ThreadNum = 1;
        spider.Run();
    }
}
}
6.源码自取
本文介绍了一个基于.NET Core 2.0和DotNetSpider的爬虫项目,用于爬取句子迷网站上的原创句子。项目采用控制台应用程序形式,并利用NLog进行日志记录。
2186

被折叠的 条评论
为什么被折叠?



