帮别人写了个爬取网页上部分数据的小程序,最初采用方式一,每次都能准确读取数据,但是需要手动输入cookie,略麻烦。
后来采用方式二,直接执行jar程序即可,弊端就是有时候识别验证码会不准确,需要多执行几次才能正常获取数据。
一、手动登录系统,将cookie粘贴到配置文件中,然后通过读取配置文件中的cookie模拟登录
/**
 * Creates a {@link WebClient} that presents itself as Firefox and carries a
 * fixed set of browser-like request headers.
 *
 * <p>JavaScript and CSS processing are switched off, script errors do not
 * raise exceptions, and the timeout option is set to 35000 (presumably
 * milliseconds). The cookie-injection code for the "manual cookie" approach
 * is kept below as a comment; it is not active.
 *
 * @return the configured client
 */
private static WebClient getWebClient() {
    final WebClient client = new WebClient(BrowserVersion.FIREFOX_45);

    // Content negotiation and connection headers.
    client.addRequestHeader("Accept", " text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    client.addRequestHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
    client.addRequestHeader("Connection", "keep-alive");

    // Disabled: load the session cookies from the configuration file and
    // attach them to the client (approach one).
    //
    // CookieManager cm = new CookieManager();
    // cm.setCookiesEnabled(true);
    // String aspCookie = ReadConf.readProperties(Constants.ASP_COOKIE);
    // String ssoCookie = ReadConf.readProperties(Constants.SSO_COOKIE);
    // String psdCookie = ReadConf.readProperties(Constants.PSD_COOKIE);
    // cm.addCookie(new Cookie(Constants.DOMAIN, Constants.ASP_COOKIE, aspCookie));
    // cm.addCookie(new Cookie(Constants.DOMAIN, Constants.SSO_COOKIE, ssoCookie));
    // cm.addCookie(new Cookie(Constants.DOMAIN, Constants.PSD_COOKIE, psdCookie));
    // webClient.setCookieManager(cm);

    // Target host and browser identification headers.
    client.addRequestHeader("Host", Constants.DOMAIN);
    // client.addRequestHeader("Referer", Constants.START_URL);
    client.addRequestHeader("Upgrade-Insecure-Requests", "1");
    client.addRequestHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0");

    client.setAjaxController(new NicelyResynchronizingAjaxController());

    // Tune the client through its options holder.
    final WebClientOptions options = client.getOptions();
    options.setJavaScriptEnabled(false);
    options.setCssEnabled(false);
    options.setTimeout(35000);
    options.setThrowExceptionOnScriptError(false);

    return client;
}
二、识别验证码,模拟登录
/**
 * Logs in automatically: fills the user name, password and OCR-recognized
 * captcha into the login form and submits it.
 *
 * <p>The captcha image is located by its {@code alt} text, written to a
 * temporary GIF file, and passed to {@link ReadImg#readImage(File)}.
 *
 * @return the page produced by clicking the submit button, or {@code null}
 *         when the login page could not be loaded, the captcha image or the
 *         form was not found, or an I/O error occurred
 */
public static HtmlPage loadSystemAuto() {
    HtmlPage loadPage = null;
    try {
        // NOTE(review): the client is left open, as in the original code;
        // presumably the caller keeps navigating from the returned page — confirm.
        WebClient webClient = new WebClient();
        HtmlPage page = webClient.getPage(Constants.LOGIN_URL);
        System.out.println(page.asText());

        HtmlInput user = page.getHtmlElementById("txtUserName");
        user.setAttribute("value", Constants.USERNAEM);
        HtmlInput pw = page.getHtmlElementById("txtPassword");
        pw.setAttribute("value", Constants.PASSWORD);
        HtmlInput code = page.getHtmlElementById("txtCode");

        HtmlImage image = findCaptchaImage(page);
        if (image == null) {
            // Previously this case ended in a NullPointerException.
            System.err.println("Captcha image not found on login page: " + Constants.LOGIN_URL);
            return null;
        }

        String codeStr = recognizeCaptcha(image);
        System.out.println(codeStr);
        code.setAttribute("value", codeStr);

        List<HtmlForm> forms = page.getForms();
        System.out.println(forms.size());
        if (forms.isEmpty()) {
            System.err.println("No form found on login page: " + Constants.LOGIN_URL);
            return null;
        }
        HtmlForm form = forms.get(0);
        HtmlInput submitInput = form.getInputByName("btnOk");
        loadPage = submitInput.click();
        System.out.println(loadPage.asText());
    } catch (FailingHttpStatusCodeException | IOException e) {
        e.printStackTrace();
    }
    return loadPage;
}

/**
 * Finds the captcha image on the page by its alt text. When several images
 * carry that alt text the last one is used, matching the previous behavior.
 */
private static HtmlImage findCaptchaImage(HtmlPage page) {
    HtmlImage found = null;
    List<DomElement> images = page.getElementsByTagName("img");
    if (images != null) {
        for (DomElement element : images) {
            if ("请输入验证码".equals(element.getAttribute("alt")) && element instanceof HtmlImage) {
                found = (HtmlImage) element;
            }
        }
    }
    return found;
}

/**
 * Writes the captcha image to a temporary GIF file, runs OCR on it and
 * removes the file afterwards.
 */
private static String recognizeCaptcha(HtmlImage image) throws IOException {
    ImageReader reader = image.getImageReader();
    BufferedImage bufferedImage = reader.read(0);
    // A temp file replaces the hard-coded "F:\test.gif", which only worked on
    // machines that have an F: drive.
    File file = File.createTempFile("captcha", ".gif");
    try {
        ImageIO.write(bufferedImage, "gif", file);
        return ReadImg.readImage(file);
    } finally {
        if (!file.delete()) {
            file.deleteOnExit();
        }
    }
}
/**
 * Recognizes captcha text with the OCR engine from java-ocr-api.jar.
 */
public class ReadImg {

    /**
     * Runs OCR on the given image file and returns the recognized text after
     * correcting characters that are commonly misread in a numeric captcha.
     *
     * <p>Earlier versions printed the corrected text but returned the raw OCR
     * output; the corrected text is now what is returned.
     *
     * @param filepicF the image file to recognize
     * @return the corrected text, or an empty string when the engine returns
     *         {@code null}
     */
    public static String readImage(File filepicF) {
        Ocr.setUp(); // one time setup
        Ocr ocr = new Ocr(); // create a new OCR engine
        ocr.startEngine("eng", Ocr.SPEED_FASTEST); // English
        try {
            String raw = ocr.recognize(new File[]{filepicF}, Ocr.RECOGNIZE_TYPE_TEXT, Ocr.OUTPUT_FORMAT_PLAINTEXT);
            System.out.println("Result: " + raw);
            String cleaned = cleanUp(raw);
            System.out.println("图片文字为:" + cleaned);
            return cleaned;
        } finally {
            // Stop the engine even when recognition throws.
            ocr.stopEngine();
        }
    }

    /**
     * Removes separators and maps look-alike letters to digits, then trims
     * surrounding whitespace such as a trailing line break.
     */
    private static String cleanUp(String raw) {
        if (raw == null) {
            return "";
        }
        return raw.replace(",", "")
                .replace("i", "1")
                .replace(" ", "")
                .replace("'", "")
                .replace("o", "0")
                .replace("O", "0")
                .replace("g", "6")
                .replace("B", "8")
                .replace("s", "5")
                .replace("z", "2")
                .trim();
    }
}