#!/usr/local/bin/perl -w
#version: 2010-12-30
#
# Blog archiver: walks the paginated index of a blog, and for every post
# found on an index page saves the post's HTML (plus the images it links
# to) under ./Articles/.
#
# NOTE(review): this file was recovered from a copy in which all line
# breaks were collapsed and every backslash had been turned into a forward
# slash.  Escape sequences below were restored from context; confirm the
# regexes against a known-good copy if one exists.

use strict;
use warnings;
#use cwd;
use diagnostics;
use File::Basename qw(dirname);
use File::Path qw(make_path);
use LWP::UserAgent;
use LWP::Simple;
#use Win32;
#use Compress::Zlib;

#*****************************GLOBAL VARIABLES****************************#
my $bDEBUG = 1;    # when true, DEBUG_INFO()/D() print their arguments
my ($TRUE, $FALSE, $SUCCESS, $FAILED) = (1,0,1,0);
my $NEWLINE = "\r\n";

#*****************************AUXILIARY FUNCTIONS****************************#

# Print the arguments (space separated, newline terminated) when $bDEBUG is
# set; prints "Not Defined!" when called without arguments.
sub DEBUG_INFO
{
    return if (!$bDEBUG);

    # The original used defined(@_), which is a fatal error on modern perls.
    if (@_)
    {
        print "@_\n";
    }
    else
    {
        print "Not Defined!\n";
    }
}

# Shorthand for DEBUG_INFO().
sub D {DEBUG_INFO(@_);}

# Unconditionally print the arguments followed by a newline.
sub P {print "@_\n";}

# Write each element of @logPara, one per line, to $fileName.
#
#   $fileName -- target path; missing parent directories are created
#   $bAppData -- true: append to the file, false: overwrite it
#   @logPara  -- lines to write (a "\n" is added after each)
#
# Dies when a directory or the file cannot be created/written.
# Returns $SUCCESS.
sub LOG_FILE
{
    my ($fileName, $bAppData, @logPara) = @_;

    $fileName =~ s!//!/!g;

    # The original created each path component with mode 0111, which yields
    # directories nobody can write into on Unix-like systems.
    my $dir = dirname($fileName);
    make_path($dir) if (! -d $dir);

    # Three-argument open keeps the mode out of the (caller supplied) name.
    my $mode = $bAppData ? '>>' : '>';
    open(my $fh, $mode, $fileName)
        or die "Cannot open log file: $fileName!\n";
    print {$fh} "$_\n" foreach (@logPara);
    close($fh) or die "Cannot close log file: $fileName: $!\n";

    return $SUCCESS;
}

# Fetch $url with a GET request and store the complete response, as
# rendered by the response object's as_string(), in $savedFName
# (overwriting it).
sub download_webpage
{
    my ($url, $savedFName) = @_;
    D("In download_webpage() -- $savedFName\t$url");

    my $userAgent = LWP::UserAgent->new();
    $userAgent->agent('Mozilla/5.0');

    my $req = HTTP::Request->new('GET', $url);
    $req->content_type('application/x-www-form-urlencoded');

    my $res = $userAgent->request($req);

    return LOG_FILE($savedFName, $FALSE, $res->as_string());
}#download_webpage

# Download $url and store the body, in binary mode, in $savedFName.
#
# Returns $SUCCESS when the file was written and $FAILED when the download
# failed (no file is created in that case, so a later run can retry).
# Dies when the target file cannot be opened or written.
sub download_bin
{
    my ($url, $savedFName) = @_;
    D("In download_bin() -- $savedFName\t$url");

    my $outcome = get($url);
    if (!defined $outcome)
    {
        # LWP::Simple::get() returns undef on failure.  The original wrote
        # an empty file here, which callers then mistook for a finished
        # download.
        warn "Failed to download $url\n";
        return $FAILED;
    }

    # The caller may ask for a file in a directory that does not exist yet.
    my $dir = dirname($savedFName);
    make_path($dir) if (! -d $dir);

    # "or", not "||": with "||" the die was attached to the file name
    # string and could never fire.
    open(my $fh, '>', $savedFName)
        or die "Cannot open $savedFName: $!\n";
    binmode($fh);
    print {$fh} $outcome;
    close($fh) or die "Cannot close $savedFName: $!\n";

    return $SUCCESS;
}

# POST $reqStr (sent as text/xml, UTF-8) to $url.
#
# When the response body contains "Error" (case-insensitive) a message is
# printed and the whole program exits.
# NOTE(review): the exit status on that error path is 0 -- kept as found;
# confirm whether callers of the script rely on it.
sub send_request
{
    my ($url, $reqStr) = @_;
    D("In send_request() -- $url\n$reqStr");

    my $ua = LWP::UserAgent->new();
    $ua->agent('Jakarta Commons-HttpClient/3.1');

    #request
    my $req = HTTP::Request->new('POST', $url);
    $req->content_type('text/xml;charset=UTF-8');
    $req->content($reqStr);

    #response
    my $resp = $ua->request($req);
    my $respStr = $resp->content();
    if ($respStr =~ /Error/i)
    {
        P("** Send reqeust got ERROR! **\nExiting...\n");
        exit 0;
    }
}#send_request

# Return a copy of the argument with leading and trailing whitespace
# removed.  (The ($) prototype is kept so existing call sites parse the
# same way.)
sub trim($)
{
    my $string = shift;
    $string =~ s/^\s+//;
    $string =~ s/\s+$//;
    return $string;
}

# Return 1 when the argument is undef, empty, or whitespace only;
# otherwise return 0.
sub isEmptyStr
{
    my ($str) = @_;
    return 1 if (!defined($str) || $str =~ m/^\s*$/);
    return 0;
}

###############################################################################

# Helper for main(): given one HTML line containing an <img> tag, download
# the image next to the saved articles and point the tag at the local copy.
#
#   $line      -- the HTML line
#   $articleId -- id of the article being saved; used as file name prefix
#
# Returns the (possibly rewritten) line.  The line is returned unchanged
# when it has no absolute http:// image URL, when the image is one of the
# theme images that are deliberately skipped, or when the download failed.
sub _localize_image
{
    my ($line, $articleId) = @_;

    my ($imageUrl) = $line =~ m!<img.+src="(http://\S+)"!i;
    return $line if (!defined $imageUrl);

    my $slashPos   = rindex($imageUrl, '/');
    my $imageFName = $slashPos > 0 ? substr($imageUrl, $slashPos + 1) : $imageUrl;

    # Theme/decoration images are not downloaded.
    return $line if ($imageUrl =~ m/wp-content/i
                  || $imageFName =~ m/(?:author|timeicon|comments)\.gif/i);

    $imageFName = "${articleId}_$imageFName";
    my $imagePath = "./Articles/$imageFName";

    download_bin($imageUrl, $imagePath) if (not -e $imagePath);

    # \Q..\E: the URL contains regex metacharacters ('.', '?', ...).
    # Only rewrite once a local copy really exists.
    $line =~ s/\Q$imageUrl\E/$imageFName/gi if (-e $imagePath);

    return $line;
}

# Walk the index pages of the blog, starting at the front page, and save
# every post that has not been saved yet as ./Articles/<id>.htm.
#
# The number of index pages is read from the class="pages" element of the
# first page on which it is found.  Dies when a page cannot be read or the
# saved response reports "500 (Internal Server Error)".
sub main
{
    my $url_host   = "http://www.caorenchao.com/";
    my $savedFName = "Temp.htm";
    my $url        = $url_host;
    my ($pageNo, $lastPageNo) = (1, 0);

    # Avoid "host//page/N" when building the follow-up page URLs.
    (my $urlBase = $url_host) =~ s{/+$}{};

    do
    {
        download_webpage($url, $savedFName);
        open(my $page, '<', $savedFName)
            or die "Cannot open file $savedFName!";

        while (defined(my $line = <$page>))
        {
            die "Fail to download $url!\n"
                if ($line =~ /500 \(Internal Server Error\)/);

            # Pager text looks like class="pages">CURRENT/LAST
            if ($lastPageNo < 1 and $line =~ m!class="pages">\d+/(\d+)!i)
            {
                $lastPageNo = $1;
                D("Last Page is: $lastPageNo");
            }

            # Only a line that opens a post starts an article.
            my ($articleId) = $line =~ m/<div class="post" id="post-(\d+)"/i
                or next;
            D("articleId is $articleId");

            my $savedArticle = "./Articles/$articleId.htm";
            next if (-e $savedArticle);    # already archived
            P("Ready to download article: $articleId...\n");

            # Collect the article's lines up to (not including) the line
            # that links to its comments.
            my $content = "";
            do
            {
                if ($line =~ m!google!i or $line =~ m!<mce:script!i
                    or $line =~ m!/\* !i or $line =~ m{//--><!-- })
                {
                    #do nothing -- advertisement / script residue
                }
                elsif ($line =~ m! // --></mce:script>!i)
                {
                    # NOTE(review): offset logic kept exactly as found;
                    # when the line has no "</script>" rindex() is -1 and
                    # the text from offset 8 on is kept.
                    $content .= substr($line, rindex($line, "</script>") + 9);
                }
                else
                {
                    $content .= $line;
                }

                $line = <$page>;
                $line = _localize_image($line, $articleId)
                    if (defined $line && $line =~ m/<img /i);
            } while (defined $line
                     && $line !~ m!/\Q$articleId\E\.html#comments!i);

            $content = join("\n",
                '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
                '<html xmlns="http://www.w3.org/1999/xhtml">',
                '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />',
                $content,
                "<p><p><p></html>");

            LOG_FILE($savedArticle, $FALSE, $content);
        }
        close($page);

        $url = sprintf("%s/page/%d", $urlBase, ++$pageNo);
    } while ($pageNo <= $lastPageNo);
}

# Print the module search path.
sub Test02
{
    print "\@INC is @INC\n";
}

# Print the option menu.
sub print_usage
{
    print "\n";
    print "*** Function SELECTOR ***\n";
    print "*     1. TEST01         *\n";
    print "*     2. TEST02         *\n";
    print "*************************\n";
    print "\nChoose An Option: ";
}

###############################################################################
# Run the archiver when executed as a script; do nothing when the file is
# loaded from another program (e.g. a test).  The original's alternative
# branch called Test(), which is not defined in this file.
main() unless caller;