对libiconv库的使用

/*---------------------------------------------------------------------------------------
*
* Function : CWebPageAnalyze::GetArtical
* Author   : Jack.shi(cydana@126.com/QQ:1617402349)
* Purpose  : Download the page at strUrl, convert it to UTF-8 when the detected
*            encoding differs, extract the article title and body with the HTML
*            analyzer, convert both to GB2312 and reject pages that contain a
*            forbidden keyword.
* Param 1  : strUrl     (const CStdString&) [IN]  -> URL of the page
* Param 2  : strTitle   (CStdString&)       [OUT] -> article title (GB2312)
* Param 3  : strContent (CStdString&)       [OUT] -> article body  (GB2312)
* Returns  : HRESULT carrying one of the plain integer codes below
*             -1 : character-encoding conversion failed
*              0 : success
*              2 : download failed
*              3 : analysis failed (unknown encoding, analyzer error,
*                  CloseAnaly failure)
*              4 : title or body contains a forbidden keyword
* Note     : strTitle and strContent are empty on every non-zero return.
*
------------------------------------------------------------------------------------------*/
HRESULT CWebPageAnalyze::GetArtical(const CStdString& strUrl, CStdString& strTitle, CStdString& strContent)
{
	CStdString strPreLog;
	strPreLog.Format("%s:threadid:%d:", strUrl.c_str(), GetCurrentThreadId());
	g_Log.GetLog()->WriteDBGLog(NORMAL_IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "开始取网页主要内容");

	// --- 1. Download the raw page ------------------------------------------------------
	CStdString strWebSourceCode;
	CStdString strContentType;
	if (m_WebPageDownload.GetWebPageContent(strUrl, strContentType, strWebSourceCode) != 0)
	{
		g_Log.GetLog()->WriteDBGLog(IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "获取网页源代码失败");
		strTitle.clear();
		strContent.clear();
		return 2;
	}

	// strUrl is not needed any more; keep both outputs empty until the very end so
	// that every failure exit below leaves them cleared.
	strTitle.clear();
	strContent.clear();

	// --- 2. Detect the encoding and normalise the page to UTF-8 -------------------------
	CStdString strEncodingType = m_WebPageDownload.strGetEncodingType(strContentType, strWebSourceCode);
	if (strEncodingType.empty())
	{
		g_Log.GetLog()->WriteDBGLog(NORMAL_IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "不能获取到网页的编码类型");
		return 3;
	}

	CStdString strAfterTransferredToUtf8;
	const CStdString* pUtf8Source = &strWebSourceCode;	// page as-is when it already is UTF-8
	if (0 != strEncodingType.compare("utf-8"))
	{
		if (S_FALSE == hrTransferEncoding(strEncodingType.c_str(), "utf-8//IGNORE", strWebSourceCode, strAfterTransferredToUtf8))
		{
			return -1;
		}
		pUtf8Source = &strAfterTransferredToUtf8;
	}

	// --- 3. Run the HTML analyzer --------------------------------------------------------
	g_Log.GetLog()->WriteDBGLog(NORMAL_IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "得到网页源代码,开始获取网页主要内容");
	HANDLE hAnalyse = CreateAnaly();
	if (NULL == hAnalyse)
	{
		// Nothing was created, so there is nothing to close.
		g_Log.GetLog()->WriteDBGLog(NORMAL_IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "CreateAnaly创建失败");
		return 3;
	}

	CStdString	strTitleTemp;		// UTF-8 title as produced by the analyzer
	CStdString	strContentTemp;		// UTF-8 body as produced by the analyzer
	bool		bExtracted = false;

	if (S_FALSE == HtmlAnalize(hAnalyse, pUtf8Source->c_str()))
	{
		g_Log.GetLog()->WriteDBGLog(NORMAL_IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "HtmlAnalize执行失败");
	}
	else
	{
		// First call with NULL buffers: presumably reports the required lengths
		// through nTitleLen / nContentLen -- TODO confirm against the analyzer API.
		char*	pTitle		= NULL;
		char*	pContent	= NULL;
		int		nTitleLen	= 0;
		int		nContentLen	= 0;
		GetArticalEx(hAnalyse, pTitle, &nTitleLen, pContent, &nContentLen);

		if (nTitleLen < 0 || nContentLen < 0)
		{
			// A negative length cannot be allocated; treat it like an extraction failure.
			g_Log.GetLog()->WriteDBGLog(NORMAL_IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "获取源代码时内存分配失败");
		}
		else
		{
			// Zero-filled buffers with room for the terminator; released automatically.
			const int nTitleCap		= nTitleLen;
			const int nContentCap	= nContentLen;
			std::vector<char> titleBuf(nTitleCap + 1, 0);
			std::vector<char> contentBuf(nContentCap + 1, 0);
			pTitle		= &titleBuf[0];
			pContent	= &contentBuf[0];

			// Second call fills the buffers with the title and the body.
			if (S_FALSE == GetArticalEx(hAnalyse, pTitle, &nTitleLen, pContent, &nContentLen))
			{
				g_Log.GetLog()->WriteDBGLog(NORMAL_IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "获取源代码时内存分配失败");
			}
			else
			{
				// The call may rewrite the lengths; never terminate outside the buffer.
				if (nTitleLen < 0 || nTitleLen > nTitleCap)
				{
					nTitleLen = nTitleCap;
				}
				if (nContentLen < 0 || nContentLen > nContentCap)
				{
					nContentLen = nContentCap;
				}
				titleBuf[nTitleLen]		= 0;
				contentBuf[nContentLen]	= 0;

				strTitleTemp.assign(&titleBuf[0]);
				strContentTemp.assign(&contentBuf[0]);
				bExtracted = true;
			}
		}
	}

	// Single release point for the analyzer handle; a failed close is an analysis failure.
	if (FALSE == CloseAnaly(hAnalyse))
	{
		g_Log.GetLog()->WriteDBGLog(NORMAL_IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "CloseAnaly执行失败");
		bExtracted = false;
	}
	if (!bExtracted)
	{
		return 3;
	}

	// --- 4. Convert the UTF-8 results to GB2312 -----------------------------------------
	if (S_FALSE == hrTransferEncoding("utf-8", "gb2312//IGNORE", strTitleTemp, strTitle)
		|| S_FALSE == hrTransferEncoding("utf-8", "gb2312//IGNORE", strContentTemp, strContent))
	{
		strTitle.clear();
		strContent.clear();
		return -1;
	}

	// --- 5. Reject pages containing a forbidden keyword ---------------------------------
	for (vector<CStdString>::iterator it = arrWebForbiddenWord.begin(); it != arrWebForbiddenWord.end(); ++it)
	{
		if (it->empty())
		{
			// find("") matches at position 0 of any string; an empty entry would
			// otherwise reject every page.
			continue;
		}

		if (string::npos != strTitle.find(*it))
		{
			g_Log.GetLog()->WriteDBGLog(IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "网页文件头中含有禁用词");
			strTitle.clear();
			strContent.clear();
			return 4;
		}
		if (string::npos != strContent.find(*it))
		{
			g_Log.GetLog()->WriteDBGLog(IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "网页文件内容中含有禁用词");
			strTitle.clear();
			strContent.clear();
			return 4;
		}
	}

	// --- 6. Post-process and log the result ---------------------------------------------
	ProcessTxt(strTitle);
	ProcessTxt(strContent);
	g_Log.GetLog()->WriteDBGLog(WHOLE_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "标题:" + strTitle);
	g_Log.GetLog()->WriteDBGLog(WHOLE_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "内容:" + strContent);

	g_Log.GetLog()->WriteDBGLog(NORMAL_IMP_RECORD, "CWebPageAnalyze::GetArtical", strPreLog + "得到网页主要内容");

	return 0;
}
 
 

/*--------------------------------------------------------------------------------------- * * 题记  : 创建 * 作者  : Jack.shi(cydana@126.com/QQ:1617402349) * 函数名  : CWebPageAnalyze::TransferEncoding * 功能  : 进行字符串编码格式转换 * 参数1  : from(const char*)【IN】->转换前的编码格式 * 参数2  : to(const char*)【IN】->转换后的编码格式 * 参数3  : strFrom:(const CStdString&)【IN】->待转换的字符串 * 参数4  : strTitle:(CStdString&)【OUT】->转换后返回的字符串 * 返回值  : RESULT ->成功返回S_OK,失败返回S_FALSE * 注意  : 无 * ------------------------------------------------------------------------------------------*/ HRESULT CWebPageAnalyze::hrTransferEncoding(const char* from, const char* to, const CStdString& strFrom, CStdString& strTo) {   int nSourceLenTemp  = strFrom.GetLength() ;   int nStrToLen   = nSourceLenTemp * 3 ;   int nStrLeft   = nStrToLen;   char* strAfterTansfer = new char[nStrToLen + 1] ;   char* strNext   = strAfterTansfer;   CStdString    strPreLog;   strPreLog.Format("threadid:%d:", GetCurrentThreadId());   iconv_t cd = iconv_open(to, from) ;   if ( -1 == (int)cd )   {    g_Log.GetLog()->WriteDBGLog(IMP_RECORD, "CWebPageAnalyze::TransferEncoding", strPreLog + "字符转换失败,可能有不支持的字符");    delete []strAfterTansfer ;    return S_FALSE;   }   const char* strTemp  = strFrom.c_str() ;   strNext = strAfterTansfer;

  if (-1 == iconv (cd, &strTemp, (size_t*)&nSourceLenTemp, &strNext, (size_t*)&nStrLeft))   {    if (E2BIG == errno)    {     g_Log.GetLog()->WriteDBGLog(IMP_RECORD, "CWebPageAnalyze::TransferEncoding", strPreLog + "字符编码转换outputBuf空间不够");     delete []strAfterTansfer ;     if(-1 == iconv_close(cd))     {      g_Log.GetLog()->WriteDBGLog(IMP_RECORD, "CWebPageAnalyze::TransferEncoding", strPreLog + "iconv关闭失败");     }     return S_FALSE;    }    else if (EILSEQ == errno)    {     g_Log.GetLog()->WriteDBGLog(IMP_RECORD, "CWebPageAnalyze::TransferEncoding", strPreLog + "inputBuf里有非法的字符序列");     delete []strAfterTansfer ;     if(-1 == iconv_close(cd))     {      g_Log.GetLog()->WriteDBGLog(IMP_RECORD, "CWebPageAnalyze::TransferEncoding", strPreLog + "iconv关闭失败");     }     return S_FALSE;    }    else if (EINVAL == errno)    {     g_Log.GetLog()->WriteDBGLog(IMP_RECORD, "CWebPageAnalyze::TransferEncoding", strPreLog + "inputBuf字符序列不完整");     delete []strAfterTansfer ;     if(-1 == iconv_close(cd))     {      g_Log.GetLog()->WriteDBGLog(IMP_RECORD, "CWebPageAnalyze::TransferEncoding", strPreLog + "iconv关闭失败");     }     return S_FALSE;    }

  }

  if(-1 == iconv_close(cd))   {    g_Log.GetLog()->WriteDBGLog(IMP_RECORD, "CWebPageAnalyze::TransferEncoding", strPreLog + "iconv关闭失败");    delete []strAfterTansfer ;    return S_FALSE;//打开失败,可能不支持的字符集   }

  strAfterTansfer[nStrToLen - nStrLeft] = 0 ;   strTo = strAfterTansfer ;   delete []strAfterTansfer;   return S_OK ; }

 

/*-------------------------------------------------------------------------------------- * * 题记  : 创建 * 作者  : Jack.shi(cydana@126.com/QQ:1617402349) * 函数名  : CWebPageDownload::strGetEncodingType * 功能  : 返回文本的编码格式 * 参数1  : strContentType:(const CStdString&)【IN】->网页ContentType,用于分析 * 参数2  : strContent:(const CStdString&)【IN】->网页内容,用于分析 * 返回值  : CStdString ->网页的编码格式 * 注意  : 无 * -----------------------------------------------------------------------------------------*/ CStdString CWebPageDownload::strGetEncodingType(const CStdString& strContentType, const CStdString& strContent) {  CStdString strRegexTempContent ("<[\\s]*meta[\\s\\S]*http-equiv[\\s]*=[\\s]*\"[\\s]*content-type[\\s\\S]*?>") ;  CStdString strRegexRempContentType ("[\\s\\S]*charset[\\s\\S]*") ;  CStdString strMyStdString ;  strGetEncodingTypeValue (strContentType, strRegexRempContentType, strMyStdString) ;  if (!strMyStdString.empty())  {   return strMyStdString ;  }  else  {// 标志位,指示有没有在strContentType中找到编码格式,否则从strContent中找

  strMyStdString.clear() ;   strGetEncodingTypeValue (strContent, strRegexTempContent, strMyStdString) ;  }

 return strMyStdString ; }

/*--------------------------------------------------------------------------------------- * * 题记  : 创建 * 作者  : Jack.shi(cydana@126.com/QQ:1617402349) * 函数名  : CWebPageDownload::strGetEncodingTypeValue * 功能  : 提供对strGetEncodingType的核心服务 * 参数1  : strSource:(const CStdString&)【IN】->源代码,用于分析 * 参数2  : strRegexTemp:(const CStdString&)【IN】->对源代码进行过滤的正则表达式 * 返回值  : CStdString ->网页的编码格式 * 注意  : 无 * ------------------------------------------------------------------------------------------*/ void CWebPageDownload::strGetEncodingTypeValue(const CStdString& strSource, const CStdString& strRegexTemp, CStdString& strOut) {  CStdString strMyStdString ;  strOut.clear();

 regex::match_results results; //定义一个匹配结果变量  regex::rpattern pat(strRegexTemp.c_str(),regex::NOCASE);//定义一个匹配模式变量//  regex::match_results::backref_type br = pat.match(strSource, results);//对strBuffer字符串执行正则匹配

 if(br.matched)  {   CStdString str = br.str();   str.ToLower();

  string::size_type pos1 = str.find("charset") ;   if(string::npos != pos1)   {    string::size_type pos2 = str.find('=', pos1) ;    string::size_type pos3 = str.find('\"', pos1) ;    strMyStdString = str.substr(pos2 + 1, pos3 - pos2 - 1) ;    strMyStdString.TrimLeft() ;    strMyStdString.TrimRight() ;   }   else   {    strMyStdString = "" ;   }  }  else  {   strMyStdString = "" ;  }  strOut = strMyStdString; }

 

 


 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值