Implementing a Web Crawler with Selenium

This article shows how to use Selenium for browser automation, covering basic tasks such as environment setup, launching a browser, and running a search, and provides working code for crawling product listings from Yihaodian (一号店).


1 Downloads

selenium-server-standalone-2.41.0.jar

chromedriver_win32.zip

IEDriverServer_x64_2.42.0.zip

2 Environment setup

1) Unzip chromedriver_win32.zip and copy chromedriver.exe to C:/selenium/chrome/

2) Unzip IEDriverServer_x64_2.42.0.zip and copy IEDriverServer.exe to C:/selenium/ie/

3) Add the IE driver's directory to the PATH environment variable (alternatively, both driver paths can be handed to the JVM as system properties, as sketched below)
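
Instead of editing PATH, the driver locations can also be passed to Selenium as JVM system properties. The snippet below is a minimal sketch; the class name is illustrative, and the paths assume the directory layout from steps 1) and 2) above.

```java
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;

// Illustrative class name; adjust the paths if the drivers were unpacked elsewhere.
public class DriverSetup {
    public static void main(String[] args) {
        // Hand the driver locations to Selenium as system properties
        // instead of (or in addition to) putting them on PATH.
        System.setProperty("webdriver.chrome.driver", "C:/selenium/chrome/chromedriver.exe");
        System.setProperty("webdriver.ie.driver", "C:/selenium/ie/IEDriverServer.exe");

        WebDriver driver = new ChromeDriver();  // new InternetExplorerDriver() works analogously
        driver.quit();
    }
}
```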

 

3 Code examples

Create a Java project and add selenium-server-standalone-2.41.0.jar to its build path.

3.1 Baidu

import org.openqa.selenium.By;
import org.openqa.selenium.NoSuchElementException;
import org.openqa.selenium.StaleElementReferenceException;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebDriverException;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.ie.InternetExplorerDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.support.ui.ExpectedCondition;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
    private static void example() 
    {
    	//for firefox
        //WebDriver driver = new FirefoxDriver();

    	//for chrome 
    	System.setProperty("webdriver.chrome.driver", "C:/selenium/chrome/chromedriver.exe");
    	WebDriver driver = new ChromeDriver();
    	
    	//for IE
    	//WebDriver driver = new InternetExplorerDriver();

    	WebDriverWait w = new WebDriverWait(driver, 10);
    	
        driver.get("http://www.baidu.com/");  
        // Equivalent to: driver.navigate().to("http://www.baidu.com/");

        w.until(ExpectedConditions.visibilityOfElementLocated(By.id("kw1")));
        w.until(ExpectedConditions.elementToBeClickable(By.id("su1")));
        
        // Find the search input element by its id
        WebElement element = driver.findElement(By.id("kw1"));
        element.sendKeys("liaoxiangui");

        element.submit();  
       // WebDriver automatically finds the form that contains the element and submits it.
       // Equivalent to: driver.findElement(By.id("su1")).click();

        System.out.println("Page title is: " + driver.getTitle());
        
        // baidu's search is rendered dynamically with JavaScript.
        // Wait for the page to load, timeout after 10 seconds
        w.until(new ExpectedCondition<Boolean>() {
            public Boolean apply(WebDriver d) {
                return d.getTitle().toLowerCase().startsWith("liaoxiangui");
            }
        });
        
        System.out.println("Page title is: " + driver.getTitle());
        
        try { Thread.sleep(10000); } catch (Exception e) {}  // pause so the result stays visible before closing
        
        //Close the browser
        driver.quit();
    }
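
The listing above is a method fragment. For a quick end-to-end check of the setup, a minimal self-contained sketch is shown below; the class name is illustrative and the driver path assumes the Chrome layout from section 2.

```java
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;

// Minimal standalone smoke test; class name and driver path are placeholders.
public class BaiduSmokeTest {
    public static void main(String[] args) {
        System.setProperty("webdriver.chrome.driver", "C:/selenium/chrome/chromedriver.exe");
        WebDriver driver = new ChromeDriver();
        try {
            driver.get("http://www.baidu.com/");
            System.out.println("Page title is: " + driver.getTitle());
        } finally {
            driver.quit();  // always release the browser, even on failure
        }
    }
}
```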

3.2 Yihaodian (一号店)

private static void yihaodian()
{
	System.setProperty("webdriver.chrome.driver", "C:/work/research/bijia/selenium/chromedriver_win32/chromedriver.exe");
	
	ChromeOptions options = new ChromeOptions();
	//options.addArguments("--disable-images");
	WebDriver driver = new ChromeDriver(options);
    
	try
	{
	driver.get("http://www.yhd.com/ctg/s2/c21289-0-60761/b/a-s1-v0-p15-price-d0-f0-m1-rt0-pid-mid0-k/#page=1&sort=1");
        
	Boolean first = true;
        int products=0;
        WebDriverWait ww = new WebDriverWait(driver, 10);
        
        while(true)
        {	
            System.out.println("processing filter page:"+driver.getCurrentUrl());
        	
            if(first)
            {
            	first=false;
	            ww.until(ExpectedConditions.presenceOfElementLocated(By.id("startShopping")));		            

	            WebElement s = driver.findElement(By.id("selectProvince"));
	            s.click();
	            s = driver.findElement(By.id("p_13"));
	            s.click();
	            s = driver.findElement(By.id("startShopping"));
	            s.click();
            }
            
            ww.until(ExpectedConditions.presenceOfAllElementsLocatedBy(By.cssSelector("#itemSearchList > li")));
            // Maximize the window. The button being clicked must be visible in the viewport,
            // otherwise an exception may be thrown?
            //driver.manage().window().maximize();
            final int liCounter = driver.findElements(By.cssSelector("#itemSearchList > li")).size();		        
	        for(int kk=0;kk<liCounter;kk++)
	        {	
	        	try
	        	{
		        	String cssStr = "#itemSearchList > li:nth-of-type("+(kk+1)+")";
		        	WebElement li = driver.findElement(By.cssSelector(cssStr));
		        	products++;
		        	
		        	String t;

		        	try{
		        	ww.until(ExpectedConditions.visibilityOf(li.findElement(By.cssSelector(".owner > a"))));
		        	t = li.findElement(By.cssSelector(".owner > a")).getAttribute("title");
		        	System.out.println("vendor name="+t);
		        	}
		        	catch(NoSuchElementException e)
		        	{
		        		System.out.println("vendor=自营");
		        	}
		        	
		        	
		        	ww.until(ExpectedConditions.visibilityOf(li.findElement(By.className("electrical_item_box"))));
		        	t = li.findElement(By.className("electrical_item_box")).getAttribute("comproid");
		        	System.out.println("selfid="+t);
		        	
		        	ww.until(ExpectedConditions.visibilityOf(li.findElement(By.cssSelector("div > .search_prod_img > img"))));
		        	t = li.findElement(By.cssSelector("div > .search_prod_img > img")).getAttribute("src");
		        	System.out.println("pic url="+t);
		        	
		        	ww.until(ExpectedConditions.visibilityOf(li.findElement(By.cssSelector(".title > .title"))));
		        	t = li.findElement(By.cssSelector(".title > .title")).getText();
		        	System.out.println("title="+t);
		        	
		        	t = li.findElement(By.cssSelector(".title > .title")).getAttribute("href");
		        	System.out.println("detailed url="+t);

		        	ww.until(ExpectedConditions.visibilityOf(li.findElement(By.className("color_red"))));
		        	t = li.findElement(By.className("color_red")).getText();
		        	System.out.println("price="+t);
	        	}
	        	catch(StaleElementReferenceException ex) //see http://docs.seleniumhq.org/exceptions/stale_element_reference.jsp
	        	{
	        		kk--;
	        		System.out.println("stale element. retry to get it.");
	        	}
	        	//break;
	        }
	        
	        
	        WebElement ne = null;
	        while(true)
	        {
	        	try
	        	{
			        ww.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("#rankOpDiv")));
			        // If the specified element cannot be found, Selenium throws an exception instead of returning null.
		        	ne = driver.findElement(By.cssSelector("#rankOpDiv .select_page_btn a.next")); //throw NoSuchElementException
		        	ww.until(ExpectedConditions.elementToBeClickable(By.cssSelector("#rankOpDiv a.next")));
		        	ne.click();
	        	}
	        	catch(StaleElementReferenceException ex)
	        	{
	        		System.out.println("retry  going to next page.");
	        		continue;
	        	}
	        	catch(NoSuchElementException e)
	        	{
	        		System.out.println("this category end!");
	        	}
	        	break;
	        }
	        
        	
	        if(ne == null)
	        	break;
        }
        
        System.out.println("get "+products+" products.");
	}
	catch(Exception ex)
	{
		System.out.println(ex);
	}
    
    //Close the browser
    driver.quit();
}
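
The crawler above retries by decrementing the loop counter whenever a StaleElementReferenceException is thrown. The same idea can be factored into a small helper. The sketch below is only an illustration: the class name, the Supplier-based signature, and the retry limit are assumptions rather than part of the original code, and it requires Java 8+.

```java
import java.util.function.Supplier;
import org.openqa.selenium.StaleElementReferenceException;

// Generic retry wrapper for the stale-element pattern used above (illustrative).
public class StaleRetry {
    public static <T> T retryOnStale(Supplier<T> action, int maxAttempts) {
        for (int i = 0; i < maxAttempts; i++) {
            try {
                return action.get();  // re-run the element lookup plus the read
            } catch (StaleElementReferenceException e) {
                if (i == maxAttempts - 1) {
                    throw e;          // out of retries: propagate the exception
                }
                // Otherwise the DOM was re-rendered; loop and look the element up again.
            }
        }
        throw new IllegalStateException("maxAttempts must be >= 1");
    }
}
```

A hypothetical usage, reading a price field: `String price = StaleRetry.retryOnStale(() -> driver.findElement(By.className("color_red")).getText(), 3);`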





### Writing a web crawler with Python and Selenium

#### Initial environment setup

To start a Python + Selenium web-scraping project, first install the required packages and set up a WebDriver. Different browsers need different WebDrivers; for example, Selenium 3 supports Firefox by default but requires geckodriver to be installed separately. For Chrome, download the matching chromedriver and pass its path when creating the `webdriver.Chrome()` instance.

```python
from selenium import webdriver

path = 'C:\\Path\\To\\Your\\chromedriver.exe'  # replace with the actual location
browser = webdriver.Chrome(executable_path=path)
```

#### Fetching the target page

Use the browser object created above to visit a specific URL and load the page you want to scrape:

```python
target_url = "http://example.com"
browser.get(target_url)

# Print the full HTML source for debugging or parsing
print(browser.page_source)
```

#### Locating elements and interacting with them

XPath, CSS selectors, and similar locators let you pinpoint components on the page (buttons, input boxes, and so on) and then click them, fill in forms, and perform other actions. A simple example: type a keyword into the search box and submit the query:

```python
search_box = browser.find_element_by_name('q')  # assumes the search engine exposes an input named 'q'
search_box.send_keys("Web Scraping with Selenium")
search_box.submit()  # submits the form that contains the search box
```

> **Note**: the `find_element_by_*` methods are deprecated; in newer versions use the `find_element(By.*, "...")` form instead.

#### Collecting and processing the data

Once the planned interactions are done, extract the pieces of information you care about from the DOM tree. This usually means iterating over a collection of nodes, reading attribute values, or pulling out plain-text strings. The snippet below shows how to gather a list of links:

```python
links = []
for link in browser.find_elements_by_tag_name('a'):
    links.append(link.get_attribute('href'))

# Or, more concisely, with a list comprehension
links = [link.get_attribute('href') for link in browser.find_elements_by_tag_name('a')]
```

Finally, remember to release resources by closing the opened windows/tabs and quitting the session entirely:

```python
browser.quit()
```