Using LoadRunner as a Web Crawler

In his post "LoadRunner as a WebCrawler", Kim describes how to build a simple web crawler with LoadRunner:

http://ptfrontline.wordpress.com/2008/04/07/loadrunner-as-a-webcrawler/

In performance testing, a crawler like this is useful for warming up caches before measurements are taken.
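
Kim's Process_Level1() function below consumes a parameter list named URL_LIST1, so some earlier step has to capture that list and request the start page. A minimal seed step might look like the following sketch (assuming {BaseURL} is already defined as a script parameter; the step name "Home Page" is just a placeholder):

Action()
{
    // Capture every href="..." value on the start page into
    // URL_LIST1_1 .. URL_LIST1_n, plus URL_LIST1_count
    web_reg_save_param("URL_LIST1",
        "LB=href=\"",
        "RB=\"",
        "Ord=All",
        "Search=Body",
        "NotFound=Warning",
        LAST );

    web_url("Home Page",
        "URL={BaseURL}",
        "Resource=0",
        "RecContentType=text/html",
        "Mode=HTML",
        LAST);

    Process_Level1();
    return 0;
}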

void Process_Level1()
{
    int  i;
    char buf[2048];
    char buf2[2048];
    char *pos;
    int  res;
    int  count;

    count = atoi( lr_eval_string("{URL_LIST1_count}") );
    if (count > 0)
    for ( i = 1; i <= count; i++ )
    {
        // Fetch the i:th captured URL
        sprintf( buf, "{URL_LIST1_%d}", i );
        strcpy( buf2, lr_eval_string(buf) );

        // Flag links we don't want to follow (absolute links to
        // other hosts and non-HTTP pseudo-links, for example)
        res = 0;
        pos = (char *)strstr( buf2, "http" );        if (pos > 0) res++;
        pos = (char *)strstr( buf2, "mailto:" );     if (pos > 0) res++;
        pos = (char *)strstr( buf2, "javascript:" ); if (pos > 0) res++;

        if (res == 0)
        {
            lr_save_string( lr_eval_string(buf), "URL" );
            // Replace &amp; with & - NONSTANDARD FUNCTION
            lr_replace( "URL", "&amp;", "&" );

            web_reg_save_param("URL_LIST2",   // save all href="" URLs
                "LB=href=\"",
                "RB=\"",
                "Ord=All",
                "Search=Body",
                "NotFound=Warning",
                LAST );

            web_url("URL",
                "URL={BaseURL}{URL}",
                "TargetFrame=",
                "Resource=0",
                "RecContentType=text/html",
                "Mode=HTML",
                LAST);

            // Process all "URL_LIST2" entries
            Process_Level2();
        }
    }
}
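
The lr_replace() call above is not a standard LoadRunner function, as the comment notes. One way such a helper could be written is sketched below; it rewrites every occurrence of a substring in a parameter's value and saves the result back under the same name (the fixed 4 KB buffers are an assumption to keep the sketch short):

// Sketch of the nonstandard lr_replace(paramName, search, replace):
// replaces every occurrence of `search` in the named parameter.
int lr_replace(char *paramName, char *search, char *replace)
{
    char buf[4096];
    char result[4096];
    char *pos, *cur;

    // Read the parameter's current value
    sprintf(buf, "{%s}", paramName);
    strcpy(buf, lr_eval_string(buf));

    result[0] = '\0';
    cur = buf;
    while ((pos = (char *)strstr(cur, search)) != NULL) {
        strncat(result, cur, pos - cur);   // copy the text before the match
        strcat(result, replace);          // append the replacement
        cur = pos + strlen(search);       // continue after the match
    }
    strcat(result, cur);                  // copy the remaining tail

    lr_save_string(result, paramName);
    return 0;
}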

Vince Lozada later refined the script, using recursion so that every URL is visited exactly once:

char **myList;
int numListElements = 0;
int listSize = 1;

void addItemToList(char *item);
int  isItemInList(char *item);
void Process_URLs(int index);
void printList();

Action()
{
    int i;

    web_reg_save_param("URL_LIST1",
        "LB=href=\"",
        "RB=\"",
        "Ord=All",
        "Search=Body",
        "NotFound=Warning",
        LAST );

    web_url("Home Page",
        "URL={BaseURL}",
        "TargetFrame=",
        "Resource=0",
        "RecContentType=text/html",
        "Referer=",
        "Snapshot=t1.inf",
        "Mode=HTML",
        LAST);

    Process_URLs(1);

    // Free the duplicated URL strings as well as the list itself
    for (i = 0; i < numListElements; i++)
        free(myList[i]);
    free(myList);
    myList = 0;
    numListElements = 0;
    listSize = 1;

    return 0;
}

void Process_URLs(int index)
{
    int i;
    int nextIndex;
    char listName[255];
    char listCountParamName[255];
    char listItemParamName[255];
    int count;
    char *resourceName;

    nextIndex = index + 1;

    sprintf(listCountParamName, "{URL_LIST%d_count}", index);
    count = atoi(lr_eval_string(listCountParamName));

    if (count > 0) {
        for (i = 1; i <= count; i++) {
            sprintf(listItemParamName, "{URL_LIST%d_%d}", index, i);
            lr_save_string(lr_eval_string(listItemParamName), "URL");

            // Only visit each URL once
            if (isItemInList(lr_eval_string("{URL}")) == 0) {
                // Duplicate the URL: lr_eval_string() returns a pointer
                // into an internal buffer that later calls overwrite
                char *str = (char *)malloc(strlen(lr_eval_string("{URL}")) + 1);
                strcpy(str, lr_eval_string("{URL}"));
                addItemToList(str);

                sprintf(listName, "URL_LIST%d", nextIndex);
                web_reg_save_param(listName,
                    "LB=href=\"",
                    "RB=\"",
                    "Ord=All",
                    "Search=Body",
                    "NotFound=Warning",
                    LAST );

                // Use the last path segment as the step name;
                // fall back to the whole URL if it contains no '/'
                resourceName = (char *) strrchr(lr_eval_string("{URL}"), '/');
                if (resourceName == NULL)
                    resourceName = lr_eval_string("{URL}");

                web_url(resourceName,
                    "URL={BaseURL}{URL}",
                    "TargetFrame=",
                    "Resource=0",
                    "RecContentType=text/html",
                    "Mode=HTML",
                    LAST);

                // Recurse into the links captured by this request
                Process_URLs(nextIndex);
            }
        }
    }
}

void addItemToList(char *item) {
    char **newList;
    int i;

    if (!myList) {
        myList = (char **) malloc(listSize * sizeof(char *));
    }

    // Double the capacity when the list is full
    if (++numListElements > listSize) {
        newList = (char **) malloc(listSize * 2 * sizeof(char *));
        for (i = 0; i < listSize; ++i) {
            newList[i] = myList[i];
        }
        listSize *= 2;
        free(myList);
        myList = newList;
    }

    myList[numListElements - 1] = item;
}

int isItemInList(char *item) {
    int i;
    for (i = 0; i < numListElements; ++i) {
        if (!strcmp(item, myList[i])) {
            return 1;
        }
    }
    return 0;
}

void printList() {
    int i;
    for (i = 0; i < numListElements; ++i) {
        lr_output_message("%s", myList[i]);
    }
}
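
Both scripts lean on the same LoadRunner mechanism: web_reg_save_param() with "Ord=All" stores every match of the left/right boundaries as a numbered parameter array, so registering URL_LIST2 before a request yields URL_LIST2_1 through URL_LIST2_n plus a URL_LIST2_count entry. Vince's version generates a fresh list name per recursion level (URL_LIST%d), which is what lets each crawled page feed the next round of Process_URLs().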

I gave this script a try and found it still falls short: the handling of extracted link URLs needs to be considerably more careful.
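
For instance, a captured href value can be a relative path, an in-page anchor, an absolute URL pointing at another host, or a mailto:/javascript: pseudo-link, and "URL={BaseURL}{URL}" only makes sense for the first kind. A filter along the following lines could be applied before each web_url() call; the rules are illustrative and the helper name is made up for this sketch:

// Sketch: return 1 if a captured href should be crawled as
// {BaseURL}{URL}, 0 otherwise. The exact rules depend on the site.
int isCrawlableUrl(char *url)
{
    if (url == NULL || *url == '\0')        return 0;
    if (strstr(url, "http:")  == url)       return 0;  // absolute URL
    if (strstr(url, "https:") == url)       return 0;  // absolute URL
    if (strstr(url, "mailto:") == url)      return 0;  // not an HTTP link
    if (strstr(url, "javascript:") == url)  return 0;  // script pseudo-link
    if (*url == '#')                        return 0;  // in-page anchor
    return 1;
}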
