Implementing a Web Crawler with Selenium

This article shows how to use Selenium for browser automation, covering basic tasks such as environment setup, launching a browser, and running a search, and provides working code for crawling product listings from Yihaodian (一号店).


1 Downloads

selenium-server-standalone-2.41.0.jar

chromedriver_win32.zip

IEDriverServer_x64_2.42.0.zip

2 Environment setup

1) Unzip chromedriver_win32.zip and copy chromedriver.exe to C:/selenium/chrome/

2) Unzip IEDriverServer_x64_2.42.0.zip and copy IEDriverServer.exe to C:/selenium/ie/

3) Add the IE driver's directory to the PATH environment variable (alternatively, both driver paths can be handed to the JVM as system properties, as sketched below)
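
Instead of editing PATH, the driver locations can also be passed to Selenium as JVM system properties. The snippet below is a minimal sketch; the class name is illustrative, and the paths assume the directory layout from steps 1) and 2) above.

```java
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;

// Illustrative class name; adjust the paths if the drivers were unpacked elsewhere.
public class DriverSetup {
    public static void main(String[] args) {
        // Hand the driver locations to Selenium as system properties
        // instead of (or in addition to) putting them on PATH.
        System.setProperty("webdriver.chrome.driver", "C:/selenium/chrome/chromedriver.exe");
        System.setProperty("webdriver.ie.driver", "C:/selenium/ie/IEDriverServer.exe");

        WebDriver driver = new ChromeDriver();  // new InternetExplorerDriver() works analogously
        driver.quit();
    }
}
```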

 

3 Code examples

Create a Java project and add selenium-server-standalone-2.41.0.jar to its build path.

3.1 Baidu

import org.openqa.selenium.By;
import org.openqa.selenium.NoSuchElementException;
import org.openqa.selenium.StaleElementReferenceException;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebDriverException;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.ie.InternetExplorerDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.support.ui.ExpectedCondition;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
    private static void example() 
    {
    	//for firefox
        //WebDriver driver = new FirefoxDriver();

    	//for chrome 
    	System.setProperty("webdriver.chrome.driver", "C:/selenium/chrome/chromedriver.exe");
    	WebDriver driver = new ChromeDriver();
    	
    	//for IE
    	//WebDriver driver = new InternetExplorerDriver();

    	WebDriverWait w = new WebDriverWait(driver, 10);
    	
        driver.get("http://www.baidu.com/");  
        // Equivalent to: driver.navigate().to("http://www.baidu.com/");

        w.until(ExpectedConditions.visibilityOfElementLocated(By.id("kw1")));
        w.until(ExpectedConditions.elementToBeClickable(By.id("su1")));
        
        // Find the search input element by its id
        WebElement element = driver.findElement(By.id("kw1"));
        element.sendKeys("liaoxiangui");

        element.submit();  
       // WebDriver automatically finds the form that contains the element and submits it.
       // Equivalent to: driver.findElement(By.id("su1")).click();

        System.out.println("Page title is: " + driver.getTitle());
        
        // baidu's search is rendered dynamically with JavaScript.
        // Wait for the page to load, timeout after 10 seconds
        w.until(new ExpectedCondition<Boolean>() {
            public Boolean apply(WebDriver d) {
                return d.getTitle().toLowerCase().startsWith("liaoxiangui");
            }
        });
        
        System.out.println("Page title is: " + driver.getTitle());
        
        try { Thread.sleep(10000); } catch (Exception e) {}  // pause so the result stays visible before closing
        
        //Close the browser
        driver.quit();
    }
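
The listing above is a method fragment. For a quick end-to-end check of the setup, a minimal self-contained sketch is shown below; the class name is illustrative and the driver path assumes the Chrome layout from section 2.

```java
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;

// Minimal standalone smoke test; class name and driver path are placeholders.
public class BaiduSmokeTest {
    public static void main(String[] args) {
        System.setProperty("webdriver.chrome.driver", "C:/selenium/chrome/chromedriver.exe");
        WebDriver driver = new ChromeDriver();
        try {
            driver.get("http://www.baidu.com/");
            System.out.println("Page title is: " + driver.getTitle());
        } finally {
            driver.quit();  // always release the browser, even on failure
        }
    }
}
```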

3.2 Yihaodian (一号店)

private static void yihaodian()
{
	System.setProperty("webdriver.chrome.driver", "C:/work/research/bijia/selenium/chromedriver_win32/chromedriver.exe");
	
	ChromeOptions options = new ChromeOptions();
	//options.addArguments("--disable-images");
	WebDriver driver = new ChromeDriver(options);
    
	try
	{
	driver.get("http://www.yhd.com/ctg/s2/c21289-0-60761/b/a-s1-v0-p15-price-d0-f0-m1-rt0-pid-mid0-k/#page=1&sort=1");
        
	Boolean first = true;
        int products=0;
        WebDriverWait ww = new WebDriverWait(driver, 10);
        
        while(true)
        {	
            System.out.println("processing filter page:"+driver.getCurrentUrl());
        	
            if(first)
            {
            	first=false;
	            ww.until(ExpectedConditions.presenceOfElementLocated(By.id("startShopping")));		            

	            WebElement s = driver.findElement(By.id("selectProvince"));
	            s.click();
	            s = driver.findElement(By.id("p_13"));
	            s.click();
	            s = driver.findElement(By.id("startShopping"));
	            s.click();
            }
            
            ww.until(ExpectedConditions.presenceOfAllElementsLocatedBy(By.cssSelector("#itemSearchList > li")));
            // Maximize the window. The button being clicked must be visible in the viewport,
            // otherwise an exception may be thrown?
            //driver.manage().window().maximize();
            final int liCounter = driver.findElements(By.cssSelector("#itemSearchList > li")).size();		        
	        for(int kk=0;kk<liCounter;kk++)
	        {	
	        	try
	        	{
		        	String cssStr = "#itemSearchList > li:nth-of-type("+(kk+1)+")";
		        	WebElement li = driver.findElement(By.cssSelector(cssStr));
		        	products++;
		        	
		        	String t;

		        	try{
		        	ww.until(ExpectedConditions.visibilityOf(li.findElement(By.cssSelector(".owner > a"))));
		        	t = li.findElement(By.cssSelector(".owner > a")).getAttribute("title");
		        	System.out.println("vendor name="+t);
		        	}
		        	catch(NoSuchElementException e)
		        	{
		        		System.out.println("vendor=自营");
		        	}
		        	
		        	
		        	ww.until(ExpectedConditions.visibilityOf(li.findElement(By.className("electrical_item_box"))));
		        	t = li.findElement(By.className("electrical_item_box")).getAttribute("comproid");
		        	System.out.println("selfid="+t);
		        	
		        	ww.until(ExpectedConditions.visibilityOf(li.findElement(By.cssSelector("div > .search_prod_img > img"))));
		        	t = li.findElement(By.cssSelector("div > .search_prod_img > img")).getAttribute("src");
		        	System.out.println("pic url="+t);
		        	
		        	ww.until(ExpectedConditions.visibilityOf(li.findElement(By.cssSelector(".title > .title"))));
		        	t = li.findElement(By.cssSelector(".title > .title")).getText();
		        	System.out.println("title="+t);
		        	
		        	t = li.findElement(By.cssSelector(".title > .title")).getAttribute("href");
		        	System.out.println("detailed url="+t);

		        	ww.until(ExpectedConditions.visibilityOf(li.findElement(By.className("color_red"))));
		        	t = li.findElement(By.className("color_red")).getText();
		        	System.out.println("price="+t);
	        	}
	        	catch(StaleElementReferenceException ex) //see http://docs.seleniumhq.org/exceptions/stale_element_reference.jsp
	        	{
	        		kk--;
	        		System.out.println("stale element. retry to get it.");
	        	}
	        	//break;
	        }
	        
	        
	        WebElement ne = null;
	        while(true)
	        {
	        	try
	        	{
			        ww.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("#rankOpDiv")));
			        // If the specified element cannot be found, Selenium throws an exception instead of returning null.
		        	ne = driver.findElement(By.cssSelector("#rankOpDiv .select_page_btn a.next")); //throw NoSuchElementException
		        	ww.until(ExpectedConditions.elementToBeClickable(By.cssSelector("#rankOpDiv a.next")));
		        	ne.click();
	        	}
	        	catch(StaleElementReferenceException ex)
	        	{
	        		System.out.println("retry  going to next page.");
	        		continue;
	        	}
	        	catch(NoSuchElementException e)
	        	{
	        		System.out.println("this category end!");
	        	}
	        	break;
	        }
	        
        	
	        if(ne == null)
	        	break;
        }
        
        System.out.println("get "+products+" products.");
	}
	catch(Exception ex)
	{
		System.out.println(ex);
	}
    
    //Close the browser
    driver.quit();
}
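
The crawler above retries by decrementing the loop counter whenever a StaleElementReferenceException is thrown. The same idea can be factored into a small helper. The sketch below is only an illustration: the class name, the Supplier-based signature, and the retry limit are assumptions rather than part of the original code, and it requires Java 8+.

```java
import java.util.function.Supplier;
import org.openqa.selenium.StaleElementReferenceException;

// Generic retry wrapper for the stale-element pattern used above (illustrative).
public class StaleRetry {
    public static <T> T retryOnStale(Supplier<T> action, int maxAttempts) {
        for (int i = 0; i < maxAttempts; i++) {
            try {
                return action.get();  // re-run the element lookup plus the read
            } catch (StaleElementReferenceException e) {
                if (i == maxAttempts - 1) {
                    throw e;          // out of retries: propagate the exception
                }
                // Otherwise the DOM was re-rendered; loop and look the element up again.
            }
        }
        throw new IllegalStateException("maxAttempts must be >= 1");
    }
}
```

A hypothetical usage, reading a price field: `String price = StaleRetry.retryOnStale(() -> driver.findElement(By.className("color_red")).getText(), 3);`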





### Writing a web crawler with Python and Selenium

#### Initial environment setup

To start a Python + Selenium web-scraping project, first install the required packages and set up a WebDriver. Different browsers need different WebDrivers; for example, Selenium 3 supports Firefox by default but requires geckodriver to be installed separately. For Chrome, download the matching chromedriver and pass its path when creating the `webdriver.Chrome()` instance.

```python
from selenium import webdriver

path = 'C:\\Path\\To\\Your\\chromedriver.exe'  # replace with the actual location
browser = webdriver.Chrome(executable_path=path)
```

#### Fetching the target page

Use the browser object created above to visit a specific URL and load the page you want to scrape:

```python
target_url = "http://example.com"
browser.get(target_url)

# Print the full HTML source for debugging or parsing
print(browser.page_source)
```

#### Locating elements and interacting with them

XPath, CSS selectors, and similar locators let you pinpoint components on the page (buttons, input boxes, and so on) and then click them, fill in forms, and perform other actions. A simple example: type a keyword into the search box and submit the query:

```python
search_box = browser.find_element_by_name('q')  # assumes the search engine exposes an input named 'q'
search_box.send_keys("Web Scraping with Selenium")
search_box.submit()  # submits the form that contains the search box
```

> **Note**: the `find_element_by_*` methods are deprecated; in newer versions use the `find_element(By.*, "...")` form instead.

#### Collecting and processing the data

Once the planned interactions are done, extract the pieces of information you care about from the DOM tree. This usually means iterating over a collection of nodes, reading attribute values, or pulling out plain-text strings. The snippet below shows how to gather a list of links:

```python
links = []
for link in browser.find_elements_by_tag_name('a'):
    links.append(link.get_attribute('href'))

# Or, more concisely, with a list comprehension
links = [link.get_attribute('href') for link in browser.find_elements_by_tag_name('a')]
```

Finally, remember to release resources by closing the opened windows/tabs and quitting the session entirely:

```python
browser.quit()
```