# Usage: perl SCRIPT BLOG_NAME   (BLOG_NAME is appended to http://hi.baidu.com/)
# Author : thicket
# Date : 2013/01/31
# WebSite : hi.baidu.com
# Generates an XML file named after the current date in the current directory; the file can be imported into WordPress.
use LWP::Simple;
use HTML::Parse;
use HTML::Element;
use URI::URL;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use URI::Escape;
use POSIX;
# ---------------------------------------------------------------------------
# Main script.
#
# Takes a hi.baidu.com blog name as the first command-line argument, walks
# every index page of that blog, downloads each post linked through a
# "#reply" anchor and writes all posts as <item> elements into
# ./baidu<YYYY-M-D>.xml in the current directory.
#
# Output layout: the XML/RSS header and <channel> are written exactly once,
# followed by a <link> holding the blog URL, one <item> per post, and a single
# closing </channel></rss>.
#
# NOTE: pubDate, post_date_gmt and dc:creator are hard-coded placeholders,
# exactly as in the original script; only wp:post_date comes from the page.
# ---------------------------------------------------------------------------

# Escape the three characters that are never allowed raw in XML text.
sub _xml_escape {
    my ($text) = @_;
    $text = '' unless defined $text;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}

# Make a string safe inside <![CDATA[ ... ]]> by splitting any "]]>" it holds.
sub _cdata_safe {
    my ($text) = @_;
    $text = '' unless defined $text;
    $text =~ s/\]\]>/]]]]><![CDATA[>/g;
    return $text;
}

# $website stays a package variable: getLink() reads it to absolutise links.
our $website = $ARGV[0];
if (!$website) {
    print "=== add website ! ===\n";
    exit 1;
}
$website = 'http://hi.baidu.com/' . $website;

# File name is built from today's date only (year-month-day, not zero padded).
my ($today_mday, $today_mon, $today_year) = (localtime(time()))[3, 4, 5];
my $format_time = sprintf("%d-%d-%d", $today_year + 1900, $today_mon + 1, $today_mday);
my $file_name   = './baidu' . $format_time . '.xml';

# Work out how many index pages exist before creating the output file, so a
# failed first request does not leave an empty file behind.
my $first_page_html = get("$website?page=1");
if (!defined $first_page_html) {
    print "cannot fetch $website?page=1\n";
    exit 1;
}
my ($total_posts, $page_size) = getPageNum($first_page_html);
if (!($page_size && $page_size > 0)) {
    # Without a page size the page count would be a division by zero.
    print "cannot determine page size for $website\n";
    exit 1;
}
my $page_count = ceil(($total_posts || 0) / $page_size);

my $out_fh;
if (!open($out_fh, '>', $file_name)) {
    print "open file error : $!\n";
    exit 1;
}

my $ua = LWP::UserAgent->new;    # shared user agent for all requests

# Document header: written once, before any item.
print {$out_fh} '<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0"
xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:wfw="http://wellformedweb.org/CommentAPI/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:wp="http://wordpress.org/export/1.2/"
>
<channel>', "\n";
print {$out_fh} '<link>' . _xml_escape($website) . "</link>\n";

for my $page_no (1 .. $page_count) {
    my $url_ind = "$website?page=$page_no";
    print $url_ind . "\n";

    my $index_response = $ua->request(HTTP::Request->new('GET', $url_ind));
    if (!$index_response->is_success) {
        # Report the failure and carry on with the next index page.
        print $index_response->error_as_HTML;
        next;
    }
    my $index_html = $index_response->content;

    # Posts are located through their "#reply" anchors; the fragment and any
    # backslashes are stripped so only the plain post URL is left.
    my @reply_anchors = ($index_html =~ m/(<a[^>]+?#reply[^>]+?>)/gi);
    my $anchor_html   = join('', @reply_anchors);
    $anchor_html =~ s/[\\]|#reply//ig;

    for my $post_url (getLink($anchor_html)) {
        my $post_response = $ua->request(HTTP::Request->new('GET', $post_url));
        if (!$post_response->is_success) {
            print $post_response->error_as_HTML;
            next;
        }

        my ($date, $title, $tag, $content) = getContent($post_response->content);
        print '===============================';
        print $post_url . "\n";
        print $date . "\n" . $title . "\n" . $tag . "\n";

        my @post_tags    = split(' ', $tag);
        my $safe_content = _cdata_safe($content);

        my $item = "<item>
<title>$title</title>
<link></link>
<pubDate>Tue, 15 Jan 2013 12:53:41 +0000</pubDate>
<dc:creator>thicket</dc:creator>
<guid isPermaLink=\"false\"></guid>
<description></description>
<content:encoded><![CDATA[$safe_content]]></content:encoded>
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
<wp:post_id></wp:post_id>
<wp:post_date>$date</wp:post_date>
<wp:post_date_gmt>2013-01-15 12:53:41</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
<wp:ping_status>open</wp:ping_status>
<wp:post_name>";
        $item .= uri_escape($title);
        $item .= "</wp:post_name>
<wp:status>publish</wp:status>
<wp:post_parent>0</wp:post_parent>
<wp:menu_order>0</wp:menu_order>
<wp:post_type>post</wp:post_type>
<wp:post_password></wp:post_password>
<wp:is_sticky>0</wp:is_sticky>";

        for my $tag_name (@post_tags) {
            my $safe_tag = _cdata_safe($tag_name);
            # The attribute value is quoted; uri_escape never emits a double quote.
            $item .= "
<category domain=\"post_tag\" nicename=\"";
            $item .= uri_escape($tag_name);
            $item .= "\"><![CDATA[$safe_tag]]></category>";
        }

        $item .= "
<wp:postmeta>
<wp:meta_key>_edit_last</wp:meta_key>
<wp:meta_value><![CDATA[1]]></wp:meta_value>
</wp:postmeta>
</item>";
        print {$out_fh} ("$item\n");
    }
}

# Document footer: written once, after the last item.
print {$out_fh} ("</channel></rss>\n");
if (!close($out_fh)) {
    # Buffered write errors only surface at close time.
    print "close file error : $!\n";
    exit 1;
}
##########################################################################################
# Collect the article links found in an HTML fragment.
#
# Parameters:
#   $_[0] - HTML text containing <a> elements.
# Returns:
#   A list of URI::URL objects, one per link reported by extract_links("a"),
#   each made absolute against the package variable $website.
sub getLink {
    my ($html) = @_;
    my @full_url;

    my $tree = HTML::Parse::parse_html($html);
    for my $link_info (@{ $tree->extract_links("a") }) {
        # Element 0 of each entry is the link value itself.
        my $link = $link_info->[0];
        push(@full_url, URI::URL->new($link)->abs($website));
    }

    # Release the parse tree explicitly; this sub runs once per index page and
    # the tree was previously kept alive in a package global.
    $tree->delete;

    return @full_url;
}
# Extract the interesting parts of a single post page.
#
# Parameters:
#   $_[0] - HTML source of the post page.
# Returns a four element list:
#   $date    - text of the "content-other-info" div with markup removed
#   $title   - inner HTML of the <h2 class="title content-title"> element
#   $tag     - all "#tag" anchor texts joined by single spaces, markup removed
#   $content - inner HTML of the <div id=content ...> element, up to the
#              first closing </div>
# Any part that is not found is returned as an empty string.
#
# The patterns use /s so that values spanning several lines are captured, and
# the input is held in a lexical so the caller's $_ is left untouched.
sub getContent {
    my ($html) = @_;
    $html = '' unless defined $html;

    my ($date) = $html =~ m/<div[^>]+class=content-other-info>\s*(.+?)\s*<\/div>/is;
    $date = '' unless defined $date;
    $date =~ s/<[^>]*>//g;

    my ($title) = $html =~ m/<h2 class="title content-title">(.+?)<\/h2>/is;
    $title = '' unless defined $title;

    my ($content) = $html =~ m/<div id=content[^>]+>(.+?)<\/div>/is;
    $content = '' unless defined $content;

    # Tag anchors carry a leading '#', which is not part of the tag name.
    my @tag_names = ($html =~ m/<a class="tag"[^>]+>#(.+?)<\/a>/gis);
    my $tag = join(' ', @tag_names);
    $tag =~ s/<[^>]*>//g;

    return ($date, $title, $tag, $content);
}
# Read the paging information embedded in a blog index page.
#
# Parameters:
#   $_[0] - HTML/JavaScript source of an index page.
# Returns a two element list ($pages_totle, $pages_row):
#   $pages_totle - first number following "allCount" on the same line
#   $pages_row   - first number following "pageSize" on the same line
# A value that cannot be found is returned as 0.
#
# Only the first run of digits after each keyword is taken, so other numbers
# later on the same line no longer leak into the result.
sub getPageNum {
    my ($html) = @_;
    $html = '' unless defined $html;

    my ($total_count) = $html =~ m/allCount[^\n]*?([0-9]+)/i;
    my ($page_rows)   = $html =~ m/pageSize[^\n]*?([0-9]+)/i;

    $total_count = 0 unless defined $total_count;
    $page_rows   = 0 unless defined $page_rows;

    return ($total_count, $page_rows);
}