周末写了一个博客园博客PDF生成器,由于博客园文件上传大小的限制,我把源代码放在CSDN上了(相信大家都有帐号哈),如果没有帐号的请留下邮箱,我会尽快发给你,当然如果哪位朋友能帮忙把源代码上传到博客园上更好:博客园博客PDF生成器
废话不多说,直接看生成后的PDF效果哈:
博客中图片效果:
代码比较简单,这里先简单说一下思路,先通过博客地址取得该博客的RSS信息,这是一个XML文件,把源码存在本地,然后解析这个XML文件,从中取出需要的信息,再用iTextSharp这个DLL来操作PDF,从而生成PDF文档。
下面只贴出几个主要的类,大家有兴趣可以下载源代码看:
实体类channel,类属性是从XML文件中取得的:


using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace BlogsConvert
{
/// <summary>
/// Blog-level metadata of an RSS feed. Each property holds the value of the
/// channel child element of the same name.
/// </summary>
public class channel
{
    /// <summary>Text of the channel's <c>title</c> element.</summary>
    public string Title { get; set; }

    /// <summary>Text of the channel's <c>link</c> element.</summary>
    public string Link { get; set; }

    /// <summary>Text of the channel's <c>description</c> element.</summary>
    public string Description { get; set; }

    /// <summary>Text of the channel's <c>language</c> element.</summary>
    public string Language { get; set; }

    /// <summary>Parsed value of the channel's <c>lastBuildDate</c> element.</summary>
    public DateTime LastBuildDate { get; set; }

    /// <summary>Parsed value of the channel's <c>pubDate</c> element.</summary>
    public DateTime PubDate { get; set; }

    /// <summary>Parsed value of the channel's <c>ttl</c> element.</summary>
    public int Ttl { get; set; }
}
}
实体类item(属性来自XML文件):


using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace BlogsConvert
{
/// <summary>
/// One article entry of an RSS feed. Each property holds the value of the
/// item child element it is named after.
/// </summary>
public class item
{
    /// <summary>Text of the item's <c>title</c> element.</summary>
    public string Title { get; set; }

    /// <summary>Text of the item's <c>link</c> element.</summary>
    public string Link { get; set; }

    /// <summary>Text of the item's <c>dc:creator</c> element.</summary>
    public string Dc_creator { get; set; }

    /// <summary>Text of the item's <c>author</c> element.</summary>
    public string Author { get; set; }

    /// <summary>Parsed value of the item's <c>pubDate</c> element.</summary>
    public DateTime PubDate { get; set; }

    /// <summary>Text of the item's <c>guid</c> element.</summary>
    public string Guid { get; set; }

    /// <summary>Text of the item's <c>description</c> element (the article body as HTML).</summary>
    public string Description { get; set; }
}
}
从XML文件中提取博客信息类:


using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Xml.Linq;
using System.Xml;
namespace BlogsConvert
{
/// <summary>
/// Extracts blog (channel) and article (item) information from an RSS XML
/// file stored on disk.
/// </summary>
public class BlogsInfo
{
    /// <summary>
    /// Reads the blog-level metadata from the RSS XML file.
    /// </summary>
    /// <param name="xmlPath">Path of the XML file.</param>
    /// <returns>
    /// The populated <see cref="channel"/>, or null when the document has no
    /// <c>channel</c> element or the channel has no non-blank title.
    /// </returns>
    public channel GetChannel(string xmlPath)
    {
        XmlNode channelNode = FindChannelNode(xmlPath);
        if (channelNode == null)
        {
            return null;
        }

        channel cha = new channel();
        foreach (XmlNode chanode in channelNode.ChildNodes)
        {
            // Channel metadata is only read up to the first article entry.
            if (chanode.Name == "item")
            {
                break;
            }

            switch (chanode.Name)
            {
                case "title":
                    cha.Title = chanode.InnerText;
                    break;
                case "link":
                    cha.Link = chanode.InnerText;
                    break;
                case "description":
                    cha.Description = chanode.InnerText;
                    break;
                case "language":
                    cha.Language = chanode.InnerText;
                    break;
                case "lastBuildDate":
                    cha.LastBuildDate = ParseDate(chanode.InnerText);
                    break;
                case "pubDate":
                    cha.PubDate = ParseDate(chanode.InnerText);
                    break;
                case "ttl":
                    cha.Ttl = int.Parse(chanode.InnerText, System.Globalization.CultureInfo.InvariantCulture);
                    break;
            }
        }

        // Title stays null when the channel has no <title>; treat that like a blank title
        // instead of dereferencing it.
        if (cha.Title == null || cha.Title.Trim() == "")
        {
            return null;
        }
        return cha;
    }

    /// <summary>
    /// Reads every article from the RSS XML file.
    /// </summary>
    /// <param name="xmlPath">Path of the XML file.</param>
    /// <returns>The articles, or null when none were found.</returns>
    public IList<item> GetItems(string xmlPath)
    {
        return GetItems(xmlPath, "");
    }

    /// <summary>
    /// Reads the articles whose title contains a keyword from the RSS XML file.
    /// </summary>
    /// <param name="xmlPath">Path of the XML file.</param>
    /// <param name="keyWord">
    /// Text the article title must contain; null, empty or blank selects every article.
    /// </param>
    /// <returns>
    /// The matching articles that have a <c>link</c> element, or null when there are none.
    /// </returns>
    public IList<item> GetItems(string xmlPath, string keyWord)
    {
        IList<item> itemList = new List<item>();
        bool filterByTitle = keyWord != null && keyWord.Trim() != "";

        XmlNode channelNode = FindChannelNode(xmlPath);
        if (channelNode != null)
        {
            foreach (XmlNode statusnode in channelNode.ChildNodes)
            {
                if (statusnode.Name != "item")
                {
                    continue;
                }

                item parsed = ParseItem(statusnode, filterByTitle ? keyWord : null);
                if (parsed != null)
                {
                    itemList.Add(parsed);
                }
            }
        }

        // Callers distinguish "nothing found" by a null result.
        return itemList.Count > 0 ? itemList : null;
    }

    /// <summary>
    /// Loads the XML file and returns the first <c>channel</c> child of the
    /// document element, or null when there is none.
    /// </summary>
    private static XmlNode FindChannelNode(string xmlPath)
    {
        XmlDocument myXml = new XmlDocument();
        // The file holds content downloaded from the network: do not resolve
        // external DTDs or entities while parsing it.
        myXml.XmlResolver = null;
        myXml.Load(xmlPath);

        XmlNode root = myXml.DocumentElement;
        if (root == null)
        {
            return null;
        }

        foreach (XmlNode child in root.ChildNodes)
        {
            if (child.Name == "channel")
            {
                return child;
            }
        }
        return null;
    }

    /// <summary>
    /// Builds an <see cref="item"/> from one <c>item</c> element. Returns null when
    /// the title does not contain <paramref name="requiredInTitle"/> (if given) or
    /// the element has no <c>link</c> child.
    /// </summary>
    private static item ParseItem(XmlNode itemNode, string requiredInTitle)
    {
        item temp = new item();
        foreach (XmlNode o in itemNode.ChildNodes)
        {
            switch (o.Name)
            {
                case "title":
                    // Reject on the title itself so the filter does not depend on
                    // whether <title> comes before or after <link>.
                    if (requiredInTitle != null && !o.InnerText.Contains(requiredInTitle))
                    {
                        return null;
                    }
                    temp.Title = o.InnerText;
                    break;
                case "link":
                    temp.Link = o.InnerText;
                    break;
                case "dc:creator":
                    temp.Dc_creator = o.InnerText;
                    break;
                case "author":
                    temp.Author = o.InnerText;
                    break;
                case "pubDate":
                    temp.PubDate = ParseDate(o.InnerText);
                    break;
                case "guid":
                    temp.Guid = o.InnerText;
                    break;
                case "description":
                    temp.Description = o.InnerText;
                    break;
            }
        }

        return temp.Link != null ? temp : null;
    }

    /// <summary>
    /// Parses a feed date independently of the machine's regional settings.
    /// </summary>
    private static DateTime ParseDate(string text)
    {
        return DateTime.Parse(text, System.Globalization.CultureInfo.InvariantCulture);
    }
}
}
PDF文件生成类,也是本软件中最重要的一个类,其实就是iTextSharp的运用(这个DLL文件在源代码中有):


using System.Collections.Generic;
using System.Linq;
using System.Text;
using iTextSharp.text;
using iTextSharp.text.pdf;
using System.IO;
using System.Text.RegularExpressions;
namespace BlogsConvert
{
/// <summary>
/// Renders a blog (channel information plus articles) into a PDF file using iTextSharp.
/// </summary>
public class ToPdf:IConvert
{
    // Alignment codes passed to iTextSharp: 0 = left, 1 = centered (2 would be right).
    private const int AlignLeft = 0;
    private const int AlignCenter = 1;

    // Sentinel inserted where the HTML had a paragraph/line break; the flattened
    // text is split on it afterwards.
    private const string ParagraphMark = "卍";

    // A text segment of the form Pic<index>ciP stands for picture number <index>.
    private const string PicturePrefix = "Pic";
    private const string PictureSuffix = "ciP";

    // Captures the URL of an <img src=...> or background=... attribute
    // (quoted or unquoted) in the group "img".
    private static readonly Regex ImageUrlPattern = new Regex(
        @"(?is)(?:<img[^>]*?src|\bbackground)=(?:(['""])(?<img>[^'"">]+)\1|(?<img>[^'""\s>]+))");

    // The character entities NoHTML knows about, plus any other numeric reference.
    private static readonly Regex EntityPattern = new Regex(
        @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);", RegexOptions.IgnoreCase);

    #region IConvert members
    /// <summary>
    /// Writes the blog to a PDF file: generation notice, blog title and description,
    /// a linked table of contents, every article (text and pictures) and a closing note.
    /// </summary>
    /// <param name="commonInfo">Blog-level information; nothing is written when null.</param>
    /// <param name="itemList">Articles to render; nothing is written when null.</param>
    /// <param name="path">Full path of the PDF file to create (overwritten if it exists).</param>
    public void Convert(channel commonInfo, IList<item> itemList, string path)
    {
        if (commonInfo == null || itemList == null)
        {
            return;
        }

        // Load the font before touching the output file, so a missing font file
        // neither truncates an existing PDF nor leaves an empty document behind.
        BaseFont heiTi = BaseFont.CreateFont(@"Fonts\SIMHEI.TTF", BaseFont.IDENTITY_H, BaseFont.NOT_EMBEDDED);
        Font font = new Font(heiTi, 12);
        Font fontBlod = new Font(heiTi, 15);

        using (FileStream output = new FileStream(path, FileMode.Create))
        {
            Document document = new Document(PageSize.A4);
            PdfWriter.GetInstance(document, output);
            document.Open();
            try
            {
                AddHeader(document, commonInfo, font, fontBlod);
                AddTableOfContents(document, itemList, font, fontBlod);
                for (int i = 0; i < itemList.Count; i++)
                {
                    AddArticle(document, itemList[i], i, font, fontBlod);
                }
                AddFooter(document, font);
            }
            finally
            {
                // Always release the document (and with it the file handle),
                // even when an article fails to render.
                document.Close();
            }
        }
    }

    /// <summary>
    /// Converts an HTML fragment to plain text: removes script blocks and tags,
    /// collapses line breaks followed by whitespace, and decodes a small set of
    /// character entities (other numeric references are dropped).
    /// </summary>
    /// <param name="Htmlstring">HTML text; null is treated as empty.</param>
    /// <returns>The trimmed plain text.</returns>
    public static string NoHTML(string Htmlstring)
    {
        if (string.IsNullOrEmpty(Htmlstring))
        {
            return "";
        }

        string text = Htmlstring;
        // Singleline lets a script block that spans several lines be removed as a whole.
        text = Regex.Replace(text, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
        text = Regex.Replace(text, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"-->", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<!--.*", "", RegexOptions.IgnoreCase);
        // Decode all entities in one pass so that e.g. "&amp;lt;" becomes "&lt;" and
        // is not decoded a second time into "<".
        text = EntityPattern.Replace(text, DecodeEntity);
        // The previous implementation ended with Replace("<", ""), Replace(">", "") and
        // Replace("\r\n", "") whose results were discarded, so they never had any effect.
        // They are dropped rather than "fixed": applying them would also delete the
        // '<' and '>' characters just decoded from &lt; / &gt;.
        return text.Trim();
    }
    #endregion

    /// <summary>Maps one match of <see cref="EntityPattern"/> to its replacement text.</summary>
    private static string DecodeEntity(Match entity)
    {
        switch (entity.Groups[1].Value.ToLowerInvariant())
        {
            case "quot":
            case "#34":
                return "\"";
            case "amp":
            case "#38":
                return "&";
            case "lt":
            case "#60":
                return "<";
            case "gt":
            case "#62":
                return ">";
            case "nbsp":
            case "#160":
                return " ";
            case "iexcl":
            case "#161":
                return "\u00a1";
            case "cent":
            case "#162":
                return "\u00a2";
            case "pound":
            case "#163":
                return "\u00a3";
            case "copy":
            case "#169":
                return "\u00a9";
            default:
                // Any other numeric character reference is removed.
                return "";
        }
    }

    /// <summary>Adds the generation notice, the blog title and the blog description.</summary>
    private static void AddHeader(Document document, channel commonInfo, Font font, Font fontBlod)
    {
        Paragraph pToop = new Paragraph(new Chunk("本文档由程序整理生成(生成时间:" + DateTime.Now + ")", fontBlod));
        pToop.Alignment = AlignCenter;
        pToop.SpacingAfter = 20;
        document.Add(pToop);

        Paragraph pTitle = new Paragraph(new Phrase(commonInfo.Title, fontBlod));
        pTitle.Alignment = AlignCenter;
        pTitle.SpacingAfter = 20;
        document.Add(pTitle);

        Paragraph pDescription = new Paragraph(commonInfo.Description, font);
        pDescription.Alignment = AlignLeft;
        // Line spacing as a multiple of the font size.
        pDescription.MultipliedLeading = 2;
        pDescription.SpacingAfter = 20;
        document.Add(pDescription);
    }

    /// <summary>Adds the table of contents: one internal link per article.</summary>
    private static void AddTableOfContents(Document document, IList<item> itemList, Font font, Font fontBlod)
    {
        Paragraph heading = new Paragraph("目 录", fontBlod);
        heading.Alignment = AlignCenter;
        heading.SpacingBefore = 10;
        document.Add(heading);

        // Blank line placed after every entry and three more after the list.
        Paragraph spacer = new Paragraph(" ");
        spacer.MultipliedLeading = 1;

        for (int i = 0; i < itemList.Count; i++)
        {
            Anchor entry = new Anchor("第" + (i + 1) + "篇:" + itemList[i].Title, font);
            entry.Reference = "#" + AnchorName(i);
            document.Add(entry);
            document.Add(spacer);
        }
        document.Add(spacer);
        document.Add(spacer);
        document.Add(spacer);
    }

    /// <summary>
    /// Name of the link target for the article at <paramref name="index"/>. The position
    /// is used because publication dates are not guaranteed to be unique.
    /// </summary>
    private static string AnchorName(int index)
    {
        return "link" + index;
    }

    /// <summary>Adds one article: link target, title, body segments and a separator line.</summary>
    private static void AddArticle(Document document, item article, int index, Font font, Font fontBlod)
    {
        Anchor target = new Anchor("第" + (index + 1) + "篇:", font);
        target.Name = AnchorName(index);
        document.Add(target);

        Paragraph blogTitle = new Paragraph(article.Title, fontBlod);
        blogTitle.Alignment = AlignCenter;
        blogTitle.MultipliedLeading = 1;
        document.Add(blogTitle);

        List<string> pictureUrls = new List<string>();
        foreach (string segment in SplitIntoSegments(article.Description, pictureUrls))
        {
            int picture = PictureIndexOf(segment, pictureUrls.Count);
            if (picture >= 0)
            {
                AddPicture(document, pictureUrls[picture]);
                continue;
            }

            Paragraph blogContent = new Paragraph(segment, font);
            blogContent.Alignment = AlignLeft;
            blogContent.MultipliedLeading = 2;
            blogContent.SpacingAfter = 10;
            document.Add(blogContent);
        }

        Paragraph hr = new Paragraph("--------------------------------------------------------------------------------------------------------");
        hr.Alignment = AlignCenter;
        hr.SpacingAfter = 20;
        hr.SpacingBefore = 20;
        document.Add(hr);
    }

    /// <summary>
    /// Flattens an article's HTML into text segments. Paragraph and line-break tags
    /// become segment boundaries; every picture becomes its own segment of the form
    /// Pic&lt;index&gt;ciP, where index is the picture's position in
    /// <paramref name="pictureUrls"/> (filled by this method).
    /// </summary>
    private static string[] SplitIntoSegments(string html, IList<string> pictureUrls)
    {
        string content = html ?? "";

        // <p ...>, <br>, <br/>, <br /> (and the historical "<br/ />") start a new segment.
        content = Regex.Replace(content, @"<p\b[^>]*>|<br[\s/]*>", ParagraphMark, RegexOptions.IgnoreCase);

        // Replace each matched URL in place (not every occurrence of the same text), so a
        // picture that is repeated or wrapped in a link to itself is emitted exactly once
        // per <img>. The URL is swapped for: close the tag, placeholder segment, reopen a
        // dummy tag - the tag remnants are stripped by NoHTML and only the placeholder stays.
        content = ImageUrlPattern.Replace(content, m =>
        {
            Group url = m.Groups["img"];
            // Skip the code-folding icons.
            if (url.Value.Contains("OutliningIndicators"))
            {
                return m.Value;
            }

            pictureUrls.Add(url.Value);
            string placeholder = "\"/>" + ParagraphMark + PicturePrefix + (pictureUrls.Count - 1)
                + PictureSuffix + ParagraphMark + "<img src=\"";
            int offset = url.Index - m.Index;
            return m.Value.Substring(0, offset) + placeholder + m.Value.Substring(offset + url.Length);
        });

        content = NoHTML(content);
        return content.Split(ParagraphMark[0]);
    }

    /// <summary>
    /// Returns the picture index encoded in a placeholder segment, or -1 when the
    /// segment is ordinary text.
    /// </summary>
    private static int PictureIndexOf(string segment, int pictureCount)
    {
        int digitsLength = segment.Length - PicturePrefix.Length - PictureSuffix.Length;
        if (digitsLength <= 0
            || !segment.StartsWith(PicturePrefix, StringComparison.Ordinal)
            || !segment.EndsWith(PictureSuffix, StringComparison.Ordinal))
        {
            return -1;
        }

        int index;
        string digits = segment.Substring(PicturePrefix.Length, digitsLength);
        if (!int.TryParse(digits, System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture, out index))
        {
            return -1;
        }
        return index < pictureCount ? index : -1;
    }

    /// <summary>
    /// Loads a picture and adds it centered, shrinking it proportionally when it is
    /// wider than the printable page width.
    /// </summary>
    private static void AddPicture(Document document, string url)
    {
        // Attribute values are HTML-escaped in the feed; the loader needs the real URL.
        Image picture = Image.GetInstance(url.Replace("&amp;", "&"));

        float maxWidth = document.PageSize.Width - document.LeftMargin - document.RightMargin;
        if (picture.Width > maxWidth)
        {
            // Keep the aspect ratio: newHeight / newWidth == Height / Width.
            picture.ScaleAbsolute(maxWidth, picture.Height * maxWidth / picture.Width);
        }
        picture.Alignment = Image.MIDDLE_ALIGN;
        document.Add(picture);
    }

    /// <summary>Adds the closing credit paragraph.</summary>
    private static void AddFooter(Document document, Font font)
    {
        Paragraph drj = new Paragraph(new Chunk("本程序由博客园——天行健(http://home.cnblogs.com/u/durongjian/)制作,如有建议请发邮件至drjchina@163.com", font));
        drj.Alignment = AlignCenter;
        drj.SpacingAfter = 20;
        drj.SpacingBefore = 20;
        document.Add(drj);
    }
}
}
最后就是调用类了,先看一下软件界面吧:
后台代码:


using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using BlogsConvert;
using System.Net;
using System.IO;
namespace CnBlogsHelper
{
/// <summary>
/// Main window: downloads a blog's RSS feed, lists its articles and exports
/// the listed articles to a PDF file on the desktop.
/// </summary>
public partial class BlogToPdf : Form
{
    // Placeholder text of the keyword box; it means "no keyword".
    private const string KeyWordHint = "请输入标题中的关键字";

    public channel commonInfo = new channel();
    public IList<item> blogInfos = new List<item>();

    public BlogToPdf()
    {
        InitializeComponent();
    }

    private void BlogToPdf_Load(object sender, EventArgs e)
    {
    }

    /// <summary>Path of the local copy of the downloaded RSS document.</summary>
    private static string XmlCachePath
    {
        get { return Path.Combine(Application.StartupPath, "Blogs.xml"); }
    }

    /// <summary>
    /// Downloads the RSS document and stores it, overwriting any previous copy,
    /// as Blogs.xml in the application's start-up directory.
    /// </summary>
    /// <param name="PageUrl">URL of the RSS feed.</param>
    /// <exception cref="InvalidDataException">The server returned an empty document.</exception>
    public void GetXML(string PageUrl)
    {
        byte[] payload;
        WebRequest request = WebRequest.Create(PageUrl);
        using (WebResponse response = request.GetResponse())
        using (Stream resStream = response.GetResponseStream())
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = resStream.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }
            payload = buffer.ToArray();
        }

        // Store the bytes exactly as received. Decoding with a guessed code page and
        // re-encoding as UTF-8 corrupts the text whenever the guess is wrong; the XML
        // parser determines the encoding from the document itself.
        System.IO.File.WriteAllBytes(XmlCachePath, payload);

        // Whitespace is ASCII in every encoding a feed is likely to use, so UTF-8
        // decoding is sufficient for the emptiness check.
        if (Encoding.UTF8.GetString(payload).Trim() == "")
        {
            throw new InvalidDataException("用户名有误,请检查后重新输入!");
        }
    }

    /// <summary>
    /// Generates the PDF file on the current user's desktop.
    /// </summary>
    /// <param name="saveName">File name without the ".pdf" extension.</param>
    /// <param name="cha">Blog-level information.</param>
    /// <param name="itemList">Articles to include.</param>
    public void CreatePDF(string saveName, channel cha, IList<item> itemList)
    {
        IConvert con = new ToPdf();
        string dir = Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory);
        con.Convert(cha, itemList, Path.Combine(dir, saveName + ".pdf"));
    }

    // "Create" button: export the extracted articles.
    private void btnCreate_Click(object sender, EventArgs e)
    {
        if (!CheckForm())
        {
            return;
        }

        // Without channel information the converter writes nothing, so that case is
        // reported like an empty article list instead of claiming success.
        if (commonInfo == null || blogInfos == null || blogInfos.Count == 0)
        {
            MessageBox.Show("博客数为0,请先提取博客信息!");
            return;
        }

        string fileName = txtFileName.Text.Trim();
        try
        {
            Wait f = new Wait();
            try
            {
                f.Show();
                Application.DoEvents();
                CreatePDF(fileName, commonInfo, blogInfos);
            }
            finally
            {
                // Take the wait window down on failure as well, before any message box.
                f.Close();
            }
            MessageBox.Show("PDF文档“" + fileName + ".pdf”生成成功,文档在桌面!");
        }
        catch (Exception ex)
        {
            MessageBox.Show("异常信息:" + ex.Message);
        }
    }

    // "Find" button: download the feed and list the matching articles.
    private void btnFind_Click(object sender, EventArgs e)
    {
        if (!CheckForm())
        {
            return;
        }

        // Reset the list box and the backing list together so they cannot get out
        // of step if the download or parsing fails.
        libBlog.Items.Clear();
        blogInfos = new List<item>();

        string pageUrl = txtBlogUrl.Text.Trim();
        if (!pageUrl.EndsWith("/", StringComparison.Ordinal))
        {
            pageUrl = pageUrl + "/";
        }
        pageUrl = pageUrl + "rss";

        string keyWord = txtKeyWord.Text.Trim();
        if (keyWord == KeyWordHint)
        {
            keyWord = "";
        }

        try
        {
            // Show the wait window while working.
            Wait f = new Wait();
            try
            {
                f.Show();
                Application.DoEvents();
                GetXML(pageUrl);

                BlogsInfo blogInfo = new BlogsInfo();
                commonInfo = blogInfo.GetChannel(XmlCachePath);
                // GetItems reports "nothing found" as null.
                blogInfos = blogInfo.GetItems(XmlCachePath, keyWord) ?? new List<item>();
                foreach (item o in blogInfos)
                {
                    libBlog.Items.Add(o.Title ?? "");
                }
            }
            finally
            {
                f.Close();
            }

            if (blogInfos.Count == 0)
            {
                MessageBox.Show("没有找到符合条件的文章!");
            }
        }
        catch (Exception ex)
        {
            MessageBox.Show("异常信息:" + ex.Message);
        }
    }

    // "Clear all" button.
    private void btnClearAll_Click(object sender, EventArgs e)
    {
        libBlog.Items.Clear();
        if (blogInfos != null)
        {
            blogInfos.Clear();
        }
    }

    // "Remove selected" button.
    private void btnClearCurrent_Click(object sender, EventArgs e)
    {
        int index = libBlog.SelectedIndex;
        // SelectedIndex is -1 when nothing is selected.
        if (index < 0)
        {
            return;
        }

        libBlog.Items.RemoveAt(index);
        if (blogInfos != null && index < blogInfos.Count)
        {
            blogInfos.RemoveAt(index);
        }
    }

    // Clicking the keyword box clears the placeholder text.
    private void txtKeyWord_Click(object sender, EventArgs e)
    {
        if (txtKeyWord.Text.Trim() == KeyWordHint)
        {
            txtKeyWord.Text = "";
        }
    }

    /// <summary>
    /// Checks that the blog address and the file name are filled in; when either is
    /// blank, tells the user and resets both boxes to default values.
    /// </summary>
    /// <returns>true when both fields contain text.</returns>
    private bool CheckForm()
    {
        if (txtBlogUrl.Text.Trim() == "" || txtFileName.Text.Trim() == "")
        {
            MessageBox.Show("博客地址和保存文件名不能为空!");
            txtBlogUrl.Text = "http://www.cnblogs.com/";
            txtFileName.Text = "我的博客";
            return false;
        }
        return true;
    }
}
}
其中调用了一个等待窗体Wait,非常简单,这里就不说了,大家可以看源代码。
博客园中高手如云,本人只能算个菜,只是把自己写的一点小东西拿出来跟大家分享,希望能帮到大家,欢迎各位朋友批评指正,如果使用过程中有错误请留言哦。
本软件目的是服务博客园的朋友们,源代码完全开源,但转载或二次开发请注明出处。