// Downloads a web page by shelling out to wget and returns its contents.
//
// wget stores the page in the current working directory; this function
// expects the file to be named after the text that follows the last '/' of
// `url`, reads it back into memory and then deletes it.
//
// Parameters:
//   url - address handed to wget. It is single-quoted for the shell, so it
//         always reaches wget as one literal argument.
// Returns:
//   The text of the downloaded file with its line breaks removed (lines are
//   concatenated directly), or "" when nothing follows the last '/' of `url`
//   or the expected file cannot be opened after wget has run.
//
// NOTE: wget's exit status is not inspected; a failed download is noticed
// only because the expected file is missing.
std::string GetHtmlByWget(std::string url)
{
    // Local helper: wraps a value in single quotes for the shell so that
    // spaces, ';', '&', '$' and similar characters cannot alter the command.
    struct Shell
    {
        static std::string Quote(const std::string& arg)
        {
            std::string quoted = "'";
            for (std::string::size_type i = 0; i < arg.size(); ++i)
            {
                if (arg[i] == '\'')
                {
                    // Close the quote, emit an escaped quote, reopen.
                    quoted += "'\\''";
                }
                else
                {
                    quoted += arg[i];
                }
            }
            quoted += "'";
            return quoted;
        }
    };

    // Name of the file wget is expected to create: everything after the
    // last '/', or the whole string when there is no '/' at all.
    const std::string::size_type lastSlash = url.find_last_of('/');
    const std::string fileName =
        (lastSlash == std::string::npos) ? url : url.substr(lastSlash + 1);
    if (fileName.empty())
    {
        return "";
    }

    // -q suppresses progress output; "--" stops a value that starts with
    // '-' from being parsed as an option.
    std::string strCom = "wget -q -- ";
    strCom.append(Shell::Quote(url));
    system(strCom.c_str());

    std::string strHtml;
    {
        std::ifstream fin(fileName.c_str());
        if (!fin)
        {
            return "";
        }
        // std::getline grows the string as needed, so a line longer than a
        // fixed buffer no longer puts the stream into a failed state and
        // cuts the page short.
        std::string line;
        while (std::getline(fin, line))
        {
            strHtml.append(line);
        }
    }   // the stream is closed here, before the file is removed

    // Delete the temporary download; -f keeps rm silent.
    strCom = "rm -f -- ";
    strCom.append(Shell::Quote(fileName));
    system(strCom.c_str());

    return strHtml;
}
// Fetches a web page through wget and returns its contents.
//
// wget stores the page in the current working directory; this function
// expects the file to be named after the text that follows the last '/' of
// `url`, reads it back into memory and then deletes it.
//
// Parameters:
//   url - address handed to wget. It is single-quoted for the shell, so it
//         always reaches wget as one literal argument.
// Returns:
//   The text of the downloaded file with its line breaks removed (lines are
//   concatenated directly), or "" when nothing follows the last '/' of `url`
//   or the expected file cannot be opened after wget has run.
//
// NOTE: wget's exit status is not inspected; a failed download is noticed
// only because the expected file is missing.
std::string GetHtmlByWget(std::string url)
{
    // Local helper: wraps a value in single quotes for the shell so that
    // spaces, ';', '&', '$' and similar characters cannot alter the command.
    struct Shell
    {
        static std::string Quote(const std::string& arg)
        {
            std::string quoted = "'";
            for (std::string::size_type i = 0; i < arg.size(); ++i)
            {
                if (arg[i] == '\'')
                {
                    // Close the quote, emit an escaped quote, reopen.
                    quoted += "'\\''";
                }
                else
                {
                    quoted += arg[i];
                }
            }
            quoted += "'";
            return quoted;
        }
    };

    // Name of the file wget is expected to create: everything after the
    // last '/', or the whole string when there is no '/' at all.
    const std::string::size_type lastSlash = url.find_last_of('/');
    const std::string fileName =
        (lastSlash == std::string::npos) ? url : url.substr(lastSlash + 1);
    if (fileName.empty())
    {
        return "";
    }

    // -q suppresses progress output; "--" stops a value that starts with
    // '-' from being parsed as an option.
    std::string strCom = "wget -q -- ";
    strCom.append(Shell::Quote(url));
    system(strCom.c_str());

    std::string strHtml;
    {
        std::ifstream fin(fileName.c_str());
        if (!fin)
        {
            return "";
        }
        // std::getline grows the string as needed, so a line longer than a
        // fixed buffer no longer puts the stream into a failed state and
        // cuts the page short.
        std::string line;
        while (std::getline(fin, line))
        {
            strHtml.append(line);
        }
    }   // the stream is closed here, before the file is removed

    // Delete the temporary download; -f keeps rm silent.
    strCom = "rm -f -- ";
    strCom.append(Shell::Quote(fileName));
    system(strCom.c_str());

    return strHtml;
}
第二种方法是使用 socket 来获取网页源码:
C++代码
// Fetches a page's source with an HTTP GET request.
//
// NOTE(review): the implementation is missing from this listing. Until it
// is supplied the function returns an empty string, so it no longer runs
// off the end of a value-returning function (undefined behaviour).
//
// Parameters:
//   url - address of the page to fetch (currently unused).
// Returns:
//   Always "" for now.
std::string GetHtmlByGet(std::string url)
{
    (void)url;  // unused until the request logic is filled in
    return std::string();
}
第三种方法是使用 libcurl:

#include <stdio.h>
#include <string.h>
#include <curl/curl.h>
-
/* Capacity, in bytes, of the download buffer (terminator not included). */
#define MAX_BUF 65536

/* Receives the downloaded body; the extra byte holds the terminating NUL. */
char wr_buf[MAX_BUF+1];
/* Number of bytes currently stored in wr_buf. */
int  wr_index;

/*
 * Write callback with the signature libcurl expects for
 * CURLOPT_WRITEFUNCTION: appends one received chunk to wr_buf.
 *
 * buffer - start of the chunk
 * size   - size of one element
 * nmemb  - number of elements; the chunk is size * nmemb bytes long
 * userp  - pointer to an int that is set to 1 when the chunk does not fit;
 *          may be NULL, in which case no flag is written
 *
 * Returns the number of bytes stored, or 0 (leaving wr_buf and wr_index
 * untouched) when the chunk would exceed MAX_BUF.
 */
size_t write_data( void *buffer, size_t size, size_t nmemb, void *userp )
{
    const size_t used = (size_t)wr_index;
    const size_t room = (size_t)MAX_BUF - used;
    int *overflowed = (int *)userp;
    size_t segsize;

    /* Compare by division so that a huge size * nmemb cannot wrap around
       and slip past the capacity check. */
    if ( size != 0 && nmemb > room / size ) {
        if ( overflowed ) {
            *overflowed = 1;
        }
        return 0;
    }

    segsize = size * nmemb;

    /* Skip the copy for an empty chunk; its buffer pointer need not be valid. */
    if ( segsize > 0 ) {
        memcpy( (void *)&wr_buf[used], buffer, segsize );
    }

    wr_index = (int)(used + segsize);

    /* Keep the buffer usable as a C string after every chunk. */
    wr_buf[wr_index] = 0;

    return segsize;
}
-
-
-
int main( void ) -
{ -
CURL *curl; -
CURLcode ret; -
int wr_error; -
-
wr_error = 0; -
wr_index = 0; -
-
-
curl = curl_easy_init(); -
if (!curl) { -
printf("couldn't init curl\n"); -
return 0; -
} -
-
-
curl_easy_setopt( curl, CURLOPT_URL, "www.exampledomain.com" ); -
-
-
curl_easy_setopt( curl, CURLOPT_WRITEDATA, (void *)&wr_error ); -
curl_easy_setopt( curl, CURLOPT_WRITEFUNCTION, write_data ); -
-
-
ret = curl_easy_perform( curl ); -
-
printf( "ret = %d (write_error = %d)\n", ret, wr_error ); -
-
-
if ( ret == 0 ) printf( "%s\n", wr_buf ); -
-
curl_easy_cleanup( curl ); -
-
return 0; -
}
- #include
-
本文介绍了使用C++进行网页抓取的三种方法:利用Wget工具、通过socket编程以及使用libcurl库。展示了如何通过这些方法获取网页源码,并提供了详细的代码示例。

726

被折叠的 条评论
为什么被折叠?



