/*---------------------------------------------------------------------------------------
*
* 题记 : 创建
* 作者 : Jack.shi(cydana@126.com/QQ:1617402349)
* 函数名 : CWebPageAnalyze::GetArtical
* 功能 : 根据URL获取网页文章的标题和内容,返回形式均为ASCII
* 参数1 : strUrl:(const CStdString&)【IN】->URL地址
* 参数2 : strTitle:(CStdString&)【OUT】->返回ASCII形式的网页文章标题
* 参数2 : strContent:(CStdString&)【OUT】->返回ASCII形式的网页文章内容
* 返回值 : RESULT ->函数执行情况
* 注意 : 见下面返回各种情形
*
------------------------------------------------------------------------------------------*/
// 取得网页主要内容
// 字符编码格式转换失败,返回-1
// 当分析成功时,返回0
// 当下载失败时,返回2
// 当分析失败时,返回3
// 当含禁用关键词时,返回4
HRESULT CWebPageAnalyze::GetArtical(const CStdString& strUrl, CStdString& strTitle, CStdString& strContent)
{
CStdStringArray arForbiddenword;
CStdString strEncodingType ;
CStdString strAfterTransferredToUtf8 ;
bool flag = 0 ;
CStdString strWebSourceCode;
CStdString strContentType ;
CStdString strPreLog;
strPreLog.Format("%s:threadid:%d:", strUrl.c_str(), GetCurrentThreadId());
g_Log.GetLog()->WriteDBGLog(NORMAL_IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "开始取网页主要内容");
if (m_WebPageDownload.GetWebPageContent(strUrl, strContentType, strWebSourceCode) != 0)
{
g_Log.GetLog()->WriteDBGLog(IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "获取网页源代码失败");
strTitle.clear() ;
strContent.clear() ;
return 2;
}
strEncodingType = m_WebPageDownload.strGetEncodingType(strContentType, strWebSourceCode) ;
if (strEncodingType.empty())
{
g_Log.GetLog()->WriteDBGLog(NORMAL_IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "不能获取到网页的编码类型");
strTitle.clear() ;
strContent.clear() ;
return 3 ;
}
if(0 != strEncodingType.compare("utf-8"))
{// 如果不为utf8则转化成utf8格式
flag = 1 ;
strAfterTransferredToUtf8.clear() ;
if(S_FALSE == hrTransferEncoding (strEncodingType.c_str(), "utf-8//IGNORE", strWebSourceCode, strAfterTransferredToUtf8))
{
strTitle.clear() ;
strContent.clear() ;
return -1 ;
}
}
// 得到网页源代码,开始获取网页主要内容
g_Log.GetLog()->WriteDBGLog(NORMAL_IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "得到网页源代码,开始获取网页主要内容");
HANDLE hAnalyse = CreateAnaly () ;
if (NULL == hAnalyse)
{
g_Log.GetLog()->WriteDBGLog(NORMAL_IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "CreateAnaly创建失败");
strTitle.clear() ;
strContent.clear() ;
if (FALSE == CloseAnaly (hAnalyse))
{
g_Log.GetLog()->WriteDBGLog(NORMAL_IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "CloseAnaly执行失败");
}
return 3 ;
}
if(flag)
{
int i = strAfterTransferredToUtf8.size();
if (S_FALSE == HtmlAnalize (hAnalyse, strAfterTransferredToUtf8.c_str()))
{
g_Log.GetLog()->WriteDBGLog(NORMAL_IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "HtmlAnalize执行失败");
strTitle.clear() ;
strContent.clear() ;
if (FALSE == CloseAnaly (hAnalyse))
{
g_Log.GetLog()->WriteDBGLog(NORMAL_IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "CloseAnaly执行失败");
}
return 3 ;
}
}
else
{
if(S_FALSE == HtmlAnalize (hAnalyse, strWebSourceCode.c_str()))
{
g_Log.GetLog()->WriteDBGLog(NORMAL_IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "HtmlAnalize执行失败");
strTitle.clear() ;
strContent.clear() ;
if (FALSE == CloseAnaly (hAnalyse))
{
g_Log.GetLog()->WriteDBGLog(NORMAL_IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "CloseAnaly执行失败");
}
return 3 ;
}
}
char* pTitle = NULL ;
char* pContent = NULL ;
int nTitleLen = 0 ;
int nContentLen = 0 ;
GetArticalEx (hAnalyse, pTitle, &nTitleLen, pContent, &nContentLen) ;
pTitle = new char[nTitleLen + 1] ;
pContent = new char[nContentLen + 1] ;
// 获取源代码的标题和内容
HRESULT hMyResult = GetArticalEx (hAnalyse, pTitle, &nTitleLen, pContent, &nContentLen) ;
if (S_FALSE == hMyResult)
{
g_Log.GetLog()->WriteDBGLog(NORMAL_IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "获取源代码时内存分配失败");
delete []pTitle ;
delete []pContent ;
strTitle.clear() ;
strContent.clear() ;
if (FALSE == CloseAnaly (hAnalyse))
{
g_Log.GetLog()->WriteDBGLog(NORMAL_IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "CloseAnaly执行失败");
}
return 3 ;
}
pTitle[nTitleLen] = 0 ;
pContent[nContentLen] = 0 ;
CStdString strTitleTemp ;
CStdString strContentTemp ;
strTitle.clear() ;
strContent.clear() ;
strTitleTemp.assign(pTitle) ;
strContentTemp.assign(pContent) ;
if (FALSE == CloseAnaly (hAnalyse))
{
g_Log.GetLog()->WriteDBGLog(NORMAL_IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "CloseAnaly执行失败");
delete []pTitle ;
delete []pContent ;
strTitle.clear() ;
strContent.clear() ;
return 3 ;
}
// 将UTF8格式的标题转换成Ascii
if(S_FALSE == hrTransferEncoding ("utf-8", "gb2312//IGNORE", strTitleTemp, strTitle))
{
delete []pTitle ;
delete []pContent ;
strTitle.clear() ;
strContent.clear() ;
return -1 ;
}
// 将UTF8格式的内容转换成Ascii
if(S_FALSE == hrTransferEncoding ("utf-8", "gb2312//IGNORE", strContentTemp, strContent))
{
delete []pTitle ;
delete []pContent ;
strTitle.clear() ;
strContent.clear() ;
return -1 ;
}
// 释放掉这两块内存,因为后面将不使用
delete []pTitle ;
delete []pContent ;
pTitle = NULL ;
pContent = NULL ;
//判断标题和内容是否存在禁用关键词
vector<CStdString>::iterator first = arrWebForbiddenWord.begin() ;
vector<CStdString>::iterator last = arrWebForbiddenWord.end() ;
while (first != last)
{
if (string::npos != strTitle.find(*first))
{
strTitle.clear() ;
g_Log.GetLog()->WriteDBGLog(IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "网页文件头中含有禁用词");
strTitle.clear() ;
strContent.clear() ;
return 4 ;
}
else if (string::npos != strContent.find(*first))
{
strContent.clear() ;
g_Log.GetLog()->WriteDBGLog(IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "网页文件内容中含有禁用词");
strTitle.clear() ;
strContent.clear() ;
return 4 ;
}
++first ;
}
ProcessTxt(strTitle);
ProcessTxt(strContent);
g_Log.GetLog()->WriteDBGLog(WHOLE_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "标题:" + strTitle);
g_Log.GetLog()->WriteDBGLog(WHOLE_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "内容:" + strContent);
g_Log.GetLog()->WriteDBGLog(NORMAL_IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "得到网页主要内容");
return 0;
}
/*---------------------------------------------------------------------------------------
*
* History  : created
* Author   : Jack.shi(cydana@126.com/QQ:1617402349)
* Function : CWebPageAnalyze::hrTransferEncoding
* Purpose  : Convert a string from one character encoding to another with iconv.
* Param 1  : from    (const char*)       [IN]  -> name of the source encoding
* Param 2  : to      (const char*)       [IN]  -> name of the target encoding
* Param 3  : strFrom (const CStdString&) [IN]  -> text to convert
* Param 4  : strTo   (CStdString&)       [OUT] -> converted text; written only on success
* Returns  : S_OK on success, S_FALSE on any failure (iconv_open, iconv or iconv_close).
* Note     : The output buffer holds three bytes per input byte plus a terminator;
*            a conversion that needs more fails with E2BIG. The result is assigned
*            as a C string, so it ends at the first NUL byte of the converted data.
*
------------------------------------------------------------------------------------------*/
HRESULT CWebPageAnalyze::hrTransferEncoding(const char* from, const char* to, const CStdString& strFrom, CStdString& strTo)
{
    // iconv updates these counters through size_t*, so they must really be size_t:
    // casting the address of an int to size_t* makes iconv touch bytes outside the
    // variable wherever size_t is wider than int.
    size_t nInLeft = static_cast<size_t>(strFrom.GetLength());
    const size_t nOutCapacity = nInLeft * 3;
    size_t nOutLeft = nOutCapacity;

    // The vector owns the output buffer, so every return path releases it.
    // The extra byte is for the terminator.
    std::vector<char> outBuf(nOutCapacity + 1, '\0');

    CStdString strPreLog;
    strPreLog.Format("threadid:%d:", GetCurrentThreadId());

    // iconv_open reports failure as (iconv_t)-1; compare in the descriptor's own
    // type instead of squeezing the descriptor into an int.
    iconv_t cd = iconv_open(to, from);
    if ((iconv_t)(-1) == cd)
    {
        g_Log.GetLog()->WriteDBGLog(IMP_RECORD, "CWebPageAnalyze::TransferEncoding", strPreLog + "字符转换失败,可能有不支持的字符");
        return S_FALSE;
    }

    const char* pIn = strFrom.c_str();
    char* pOut = &outBuf[0];
    const size_t nConverted = iconv(cd, &pIn, &nInLeft, &pOut, &nOutLeft);
    const int nErr = errno;   // capture before logging or iconv_close can overwrite it

    if (static_cast<size_t>(-1) == nConverted)
    {
        // Every iconv failure is reported to the caller, including errno values
        // outside the three documented ones.
        const char* pszReason = "字符编码转换失败,未知错误";
        switch (nErr)
        {
        case E2BIG:
            pszReason = "字符编码转换outputBuf空间不够";
            break;
        case EILSEQ:
            pszReason = "inputBuf里有非法的字符序列";
            break;
        case EINVAL:
            pszReason = "inputBuf字符序列不完整";
            break;
        default:
            break;
        }
        g_Log.GetLog()->WriteDBGLog(IMP_RECORD, "CWebPageAnalyze::TransferEncoding", strPreLog + pszReason);
        if (-1 == iconv_close(cd))
        {
            g_Log.GetLog()->WriteDBGLog(IMP_RECORD, "CWebPageAnalyze::TransferEncoding", strPreLog + "iconv关闭失败");
        }
        return S_FALSE;
    }

    // A descriptor that cannot be closed is treated as a failed conversion.
    if (-1 == iconv_close(cd))
    {
        g_Log.GetLog()->WriteDBGLog(IMP_RECORD, "CWebPageAnalyze::TransferEncoding", strPreLog + "iconv关闭失败");
        return S_FALSE;
    }

    // Bytes written = capacity minus what iconv left unused; terminate right after them.
    outBuf[nOutCapacity - nOutLeft] = '\0';
    strTo = &outBuf[0];
    return S_OK;
}
/*--------------------------------------------------------------------------------------
*
* History  : created
* Author   : Jack.shi(cydana@126.com/QQ:1617402349)
* Function : CWebPageDownload::strGetEncodingType
* Purpose  : Report the character encoding of a downloaded page.
* Param 1  : strContentType (const CStdString&) [IN] -> Content-Type value of the page
* Param 2  : strContent     (const CStdString&) [IN] -> page source
* Returns  : CStdString -> the encoding name, or an empty string when neither
*            input yields one.
* Note     : strContentType is consulted first; the page source is searched only
*            when the Content-Type value gives nothing.
*
-----------------------------------------------------------------------------------------*/
CStdString CWebPageDownload::strGetEncodingType(const CStdString& strContentType, const CStdString& strContent)
{
    CStdString strEncoding;

    // First choice: a charset mentioned anywhere in the Content-Type value.
    const CStdString strHeaderPattern("[\\s\\S]*charset[\\s\\S]*");
    strGetEncodingTypeValue(strContentType, strHeaderPattern, strEncoding);

    if (strEncoding.empty())
    {
        // Fallback: a <meta http-equiv="content-type" ...> tag inside the page source.
        const CStdString strMetaPattern("<[\\s]*meta[\\s\\S]*http-equiv[\\s]*=[\\s]*\"[\\s]*content-type[\\s\\S]*?>");
        strGetEncodingTypeValue(strContent, strMetaPattern, strEncoding);
    }

    return strEncoding;
}
/*---------------------------------------------------------------------------------------
*
* 题记 : 创建
* 作者 : Jack.shi(cydana@126.com/QQ:1617402349)
* 函数名 : CWebPageDownload::strGetEncodingTypeValue
* 功能 : 提供对strGetEncodingType的核心服务
* 参数1 : strSource:(const CStdString&)【IN】->源代码,用于分析
* 参数2 : strRegexTemp:(const CStdString&)【IN】->对源代码进行过滤的正则表达式
* 返回值 : CStdString ->网页的编码格式
* 注意 : 无
*
------------------------------------------------------------------------------------------*/
void CWebPageDownload::strGetEncodingTypeValue(const CStdString& strSource, const CStdString& strRegexTemp, CStdString& strOut)
{
CStdString strMyStdString ;
strOut.clear();
regex::match_results results; //定义一个匹配结果变量
regex::rpattern pat(strRegexTemp.c_str(),regex::NOCASE);//定义一个匹配模式变量//
regex::match_results::backref_type br = pat.match(strSource, results);//对strBuffer字符串执行正则匹配
if(br.matched)
{
CStdString str = br.str();
str.ToLower();
string::size_type pos1 = str.find("charset") ;
if(string::npos != pos1)
{
string::size_type pos2 = str.find('=', pos1) ;
string::size_type pos3 = str.find('\"', pos1) ;
strMyStdString = str.substr(pos2 + 1, pos3 - pos2 - 1) ;
strMyStdString.TrimLeft() ;
strMyStdString.TrimRight() ;
}
else
{
strMyStdString = "" ;
}
}
else
{
strMyStdString = "" ;
}
strOut = strMyStdString;
}