获取网页内容

Code highlighting produced by Actipro CodeHighlighter (freeware)http://www.CodeHighlighter.com/-->int FindCodePage(PBYTE p,int nLen,CString theUrl)
{
    int nResult = -1;
    UINT u[4];
    UINT uUTF8Count = 0;
    UINT uACPCount = 0;
    nResult = -1;
    if(nLen < 8)
        return nResult;
    if (p[0] == 0xFF && p[1] == 0xFE && p[2] != 0xFF)//Unicode
    {
        nResult = CP_UTF8;
    }
    else if (p[0] == 0xEF && p[1] == 0xBB && p[2] == 0xBF)//UTF8
    {
        nResult = CP_UTF8;
    }
    else
    {
        for(DWORD i=4;i<nLen-4;i++)
        {
            u[0] = p[i];
            u[1] = p[i+1];
            u[2] = p[i+2];
            u[3] = p[i+3];
            if((u[0]&248) ==240)   //& B11111000     must be:B11110XXX
            {   
                if((u[1]&192) == 128
                &&(u[2]&192) == 128
                &&(u[3]&192) == 128)
                {
                    nResult = CP_UTF8;
                    uUTF8Count++;
                    i +=3;
                    //break;
                }
                else
                {
                    nResult = CP_ACP;
                    i ++;
                    uACPCount++;
                    break;
                }
            }
            else if((u[0]&240) ==224)   //& B11110000     must be:B1110XXXX
            {
                
                //if((p[i+1] & 192 ==128)
                 //&&(p[i+2] & 192 ==128))
                if((u[1]&192) == 128
                &&(u[2]&192) == 128)
                {
                    nResult = CP_UTF8;
                    uUTF8Count++;
                    i +=2;
                    //break;
                }
                else// if(u[0]>=128 && u[1] >=128)
                {
                    nResult = CP_ACP;
                    i ++;
                    uACPCount++;
                    break;
                }
            }
            //else if((u[0]&224) ==192)   //& B11100000     must be:B110XXXXX
            //{
            //    if((u[1]&192) == 128)
            //    {
            //        nResult = CP_UTF8;
            //        break;
            //    }
            //}
            /*else if(p[i]>160)
            {
                if((p[i+1]>160))
                {
                    nResult = CP_ACP;
                    break;
                }
            }*/
            
        }
    }
    if(nResult<0)
        nResult = CP_ACP;
    if(uUTF8Count+uACPCount>0)
        TRACE(theUrl+CString("  PageCode = %d  \n"),nResult);
    return nResult;
}
/**
 * Downloads the resource at theUrl and returns its content as text.
 *
 * The raw bytes are read in 4096-byte chunks into a buffer that grows on
 * demand, the code page is guessed with FindCodePage(), and the bytes are
 * converted to wide characters with MultiByteToWideChar().
 *
 * @param theUrl  URL handed to CInternetSession::OpenURL().
 * @return The converted text, or an empty string when the URL cannot be
 *         opened, reading fails, nothing is received, or conversion fails.
 *         Content beyond 64 MB is not read.
 */
CString GetSourceHtml(CString theUrl) 
{
    CString retVal;
    CInternetSession session;
    CStdioFile* file = NULL;    // OpenURL() is declared to return CStdioFile*

    try
    {
        // Try to connect to the given URL.
        file = session.OpenURL(theUrl);
    }
    catch (CInternetException* pOpenError)
    {
        // Opening failed: report it as "no content".
        pOpenError->Delete();
        return retVal;
    }

    if (file == NULL)
        return retVal;

    const UINT  kChunkBytes   = 4096;
    const DWORD kMaxPageBytes = 64u * 1024u * 1024u;   // hard ceiling on download size

    CByteArray page;            // owns the raw bytes; released automatically
    DWORD dwTotal = 0;          // bytes received so far
    BOOL  bReadOk = TRUE;

    try
    {
        int nCapacity = 64 * 1024;
        page.SetSize(nCapacity);

        // Read until the stream ends or the ceiling is reached.
        while (dwTotal + kChunkBytes <= kMaxPageBytes)
        {
            // Always keep room for one full chunk; double to stay amortised O(n).
            if (static_cast<DWORD>(nCapacity) - dwTotal < kChunkBytes)
            {
                nCapacity *= 2;
                page.SetSize(nCapacity);
            }

            const UINT nRead = file->Read(page.GetData() + dwTotal, kChunkBytes);
            if (nRead == 0)
                break;
            dwTotal += nRead;
        }

        file->Close();
    }
    catch (CException* pReadError)
    {
        // Network or memory failure mid-download: treat the page as unavailable.
        pReadError->Delete();
        bReadOk = FALSE;
    }

    delete file;

    if (!bReadOk || dwTotal == 0)
        return retVal;

    const int nBytes = static_cast<int>(dwTotal);

    int nCodePage = FindCodePage(page.GetData(), nBytes, theUrl);
    if (nCodePage < 0)
        nCodePage = CP_ACP;     // FindCodePage() gives -1 for very short input

    LPCSTR pszRaw = reinterpret_cast<LPCSTR>(page.GetData());

    // First pass: ask how many wide characters the conversion needs.
    const int wcsLen = ::MultiByteToWideChar(nCodePage, 0, pszRaw, nBytes, NULL, 0);
    if (wcsLen <= 0)
        return retVal;

    // MultiByteToWideChar() does not append a terminator when given an
    // explicit byte count, so reserve one extra slot for it.
    wchar_t* wszString = new wchar_t[wcsLen + 1];
    const int nWritten = ::MultiByteToWideChar(nCodePage, 0, pszRaw, nBytes, wszString, wcsLen);
    wszString[nWritten > 0 ? nWritten : 0] = L'\0';

    retVal = CString(wszString);
    delete[] wszString;

    return retVal;
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值