# Stray promotional banner (not part of the script): 清华大佬耗费三个月吐血整理的几百G的资源,免费分享!....>>>
# Author  : thicket
# Date    : 2013/01/31
# WebSite : hi.baidu.com
#
# Exports the posts of a hi.baidu.com blog into an XML file that can be
# imported into WordPress.  The file is created in the current directory and
# is named after today's date (for example ./baidu2013-1-31.xml).
#
# Usage: perl <this-script> <blog-name>
use strict;
use warnings;

use LWP::Simple;
use HTML::Parse;
use HTML::Element;
use URI::URL;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use URI::Escape;
use POSIX;

# Base URL of the blog.  Declared before the subs below because getLink()
# uses it as the default base when making links absolute.
my $website = $ARGV[0];
if (!$website) {
    print "=== add website ! ===\n";
    exit 1;
}
$website = 'http://hi.baidu.com/' . $website;

# Output file name: ./baidu<year>-<month>-<day>.xml (no zero padding).
my ($mday, $mon, $year) = (localtime(time()))[3, 4, 5];
my $format_time = sprintf("%d-%d-%d", $year + 1900, $mon + 1, $mday);
my $file_name   = './baidu' . $format_time . '.xml';

# Fetch the first listing page before touching the file system, so a failed
# download does not leave an empty export file behind.
my $first_page = get("$website?page=1");
if (!defined $first_page) {
    print "can not fetch $website?page=1\n";
    exit 1;
}

# Number of listing pages = ceil(total posts / posts per page).  When the
# counters cannot be found, read only the first page instead of dividing by
# zero.
my ($pages_totle, $pages_row) = getPageNum($first_page);
my $len = 1;
if ($pages_totle && $pages_row) {
    $len = ceil($pages_totle / $pages_row);
}
else {
    print "page count not found, reading the first page only\n";
}

my $out;
if (!open($out, '>', $file_name)) {
    print "open file error \n";
    exit 1;
}

my $ua = LWP::UserAgent->new;    # user agent shared by all requests

# The XML declaration has to be the very first thing in the file; the source
# URL is recorded as an XML comment so the document stays well-formed.
(my $source_note = $website) =~ s/--+/-/g;    # "--" is illegal inside an XML comment
print {$out} <<'XML_HEADER';
<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0"
    xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"
    xmlns:content="http://purl.org/rss/1.0/modules/content/"
    xmlns:wfw="http://wellformedweb.org/CommentAPI/"
    xmlns:dc="http://purl.org/dc/elements/1.1/"
    xmlns:wp="http://wordpress.org/export/1.2/"
>
XML_HEADER
print {$out} "<!-- $source_note -->\n";
print {$out} "<channel>\n";
# The WordPress importer rejects files without a WXR version number.
print {$out} "<wp:wxr_version>1.2</wp:wxr_version>\n";

for my $count (1 .. $len) {
    my $url_ind = "$website?page=$count";
    print $url_ind . "\n";

    my $list_response = $ua->get($url_ind);
    if (!$list_response->is_success) {
        # Listing page could not be fetched: report it and try the next page.
        print $list_response->error_as_HTML;
        next;
    }

    # Keep only the anchors that carry "#reply", strip the backslashes and
    # the "#reply" fragment, and let getLink() pull the post URLs out of them.
    my @words = ($list_response->content =~ m/(<a[^>]+?#reply[^>]+?>)/gi);
    my $words = join('', @words);
    $words =~ s/[\\]|#reply//ig;

    for my $url (getLink($words)) {
        my $post_response = $ua->get($url);
        if (!$post_response->is_success) {
            print $post_response->error_as_HTML;
            next;
        }

        my ($date, $title, $tag, $content) = getContent($post_response->content);
        print '===============================';
        print $url . "\n";
        print $date . "\n" . $title . "\n" . $tag . "\n";

        print {$out} _build_item($title, $date, $content, split(' ', $tag)), "\n";
    }
}

# Close the channel exactly once, after every page has been processed.
print {$out} "</channel></rss>\n";
close($out) or print "close file error \n";

##########################################################################################

# Wraps text in a CDATA section.  A literal "]]>" inside the text would end
# the section early, so it is split across two adjacent sections.
sub _cdata {
    my ($text) = @_;
    $text = '' unless defined $text;
    $text =~ s/\]\]>/]]]]><![CDATA[>/g;
    return "<![CDATA[$text]]>";
}

# Builds the <item> element for one post.
#   $title, $date, $content - values returned by getContent()
#   @tags                   - tag names, one <category> element each
# pubDate, post_date_gmt and dc:creator are fixed placeholder values.
sub _build_item {
    my ($title, $date, $content, @tags) = @_;

    my $rss = join("\n",
        '<item>',
        "<title>$title</title>",
        '<link></link>',
        '<pubDate>Tue, 15 Jan 2013 12:53:41 +0000</pubDate>',
        '<dc:creator>thicket</dc:creator>',
        '<guid isPermaLink="false"></guid>',
        '<description></description>',
        '<content:encoded>' . _cdata($content) . '</content:encoded>',
        '<excerpt:encoded><![CDATA[]]></excerpt:encoded>',
        '<wp:post_id></wp:post_id>',
        "<wp:post_date>$date</wp:post_date>",
        '<wp:post_date_gmt>2013-01-15 12:53:41</wp:post_date_gmt>',
        '<wp:comment_status>open</wp:comment_status>',
        '<wp:ping_status>open</wp:ping_status>',
        '<wp:post_name>' . uri_escape($title) . '</wp:post_name>',
        '<wp:status>publish</wp:status>',
        '<wp:post_parent>0</wp:post_parent>',
        '<wp:menu_order>0</wp:menu_order>',
        '<wp:post_type>post</wp:post_type>',
        '<wp:post_password></wp:post_password>',
        '<wp:is_sticky>0</wp:is_sticky>',
    );

    for my $tag_name (@tags) {
        # The nicename attribute value must be quoted to be valid XML.
        $rss .= "\n<category domain=\"post_tag\" nicename=\""
            . uri_escape($tag_name) . '">'
            . _cdata($tag_name)
            . '</category>';
    }

    $rss .= join("\n",
        '',
        '<wp:postmeta>',
        '<wp:meta_key>_edit_last</wp:meta_key>',
        '<wp:meta_value><![CDATA[1]]></wp:meta_value>',
        '</wp:postmeta>',
        '</item>',
    );

    return $rss;
}

# Get the post links.
#   $html - HTML fragment containing <a> elements
#   $base - optional base URL for relative links (defaults to $website)
# Returns the list of absolute URLs (URI::URL objects), in document order.
sub getLink {
    my ($html, $base) = @_;
    $base = $website unless defined $base;
    return unless defined $html && length $html;

    my @full_url;
    my $parsed_html = HTML::Parse::parse_html($html);
    for my $found (@{ $parsed_html->extract_links('a') }) {
        # Each entry is an array ref whose first element is the link itself.
        push @full_url, URI::URL->new($found->[0])->abs($base);
    }
    $parsed_html->delete;    # release the parse tree explicitly

    return @full_url;
}

# Get the pieces of a post out of its HTML page.
#   $html - HTML of one post page
# Returns ($date, $title, $tag, $content); $tag is the space-separated list
# of tag names.  A piece that is not found is returned as an empty string.
sub getContent {
    my ($html) = @_;
    $html = '' unless defined $html;

    # /s lets "." cross line breaks, so markup that is spread over several
    # lines is still captured.
    my ($date) = $html =~ m/<div[^>]+class=content-other-info>\s*(.+?)\s*<\/div>/is;
    $date = '' unless defined $date;
    $date =~ s/<[^>]*>//g;    # drop any markup nested inside the date block

    my ($title) = $html =~ m/<h2 class="title content-title">(.+?)<\/h2>/is;
    $title = '' unless defined $title;

    my ($content) = $html =~ m/<div id=content[^>]+>(.+?)<\/div>/is;
    $content = '' unless defined $content;

    my @tag = ($html =~ m/<a class="tag"[^>]+>#(.+?)<\/a>/gi);
    my $tag = join(' ', @tag);
    $tag =~ s/<[^>]*>//g;

    return ($date, $title, $tag, $content);
}

# Get the paging counters from a listing page.
#   $html - HTML of a listing page
# Returns ($total, $page_size): the first number following "allCount" and
# the first number following "pageSize".  A counter that is not found is
# returned as an empty string.
sub getPageNum {
    my ($html) = @_;
    $html = '' unless defined $html;

    # [^,\n\d]* keeps the search inside the same field and on the same line.
    my ($total)     = $html =~ m/allCount[^,\n\d]*(\d+)/i;
    my ($page_size) = $html =~ m/pageSize[^,\n\d]*(\d+)/i;

    $total     = '' unless defined $total;
    $page_size = '' unless defined $page_size;

    return ($total, $page_size);
}