Perl 用 Web::Scraper 的 XPath 方法采集数据并存入 MySQL

采集新浪一小块新闻的标题、URL 和内容，并存入 MySQL 数据库。多谢一位兄弟的帮忙才得以完成。速度还不是很理想，下一步看看能不能用上多线程，一步步提高自己的 Perl 水平。

#!/usr/bin/perl
use warnings;
use URI;
use Web::Scraper;
use utf8;
binmode(STDIN, ':encoding(utf8)');
binmode(STDOUT, ':encoding(utf8)');
binmode(STDERR, ':encoding(utf8)');
use DBI;
use strict;
# Connection settings for the target MySQL schema.
my $user = "root";
my $passwd = "123456";
my $data_source = "dbi:mysql:tes20";

# RaiseError makes every later DBI call die on failure, so the statements
# below do not need individual error checks.
my $dbh = DBI->connect($data_source, $user, $passwd, {RaiseError => 1})
    or die "Can't connect to $data_source: $DBI::errstr";

# Tell the server that this connection sends and expects UTF-8 text.
$dbh->do("set names utf8");

# Index page whose headline links are harvested.
my $url = "http://news.sina.com.cn";

# Collect, in parallel arrays, the anchor text ("news") and the href
# ("links") of every <a> matched by the same XPath expression.
my $proce = scraper {
    process '//ul[@class="c_l14_01"]/following-sibling::ul[1]/li[1]/a', "news[]" => "TEXT";
    process '//ul[@class="c_l14_01"]/following-sibling::ul[1]/li[1]/a', "links[]" => '@href';
};
my $res = $proce->scrape(URI->new($url));

# The article scraper does not depend on the loop variable, so it is built
# once here instead of once per article.
my $proce2 = scraper {
    process '//div[@id="artibody"]', "content[]" => 'TEXT';
};

my $sth = $dbh->prepare("INSERT into blog (title,content,url) values (?, ?, ?)");

# Fall back to empty lists when the index page matched nothing, so the loop
# simply does not run instead of dereferencing an undefined value.
my @titles = @{ $res->{news}  || [] };
my @links  = @{ $res->{links} || [] };

for my $i (0 .. $#links) {
    my $link = $links[$i];

    # A single unreachable or malformed article must not abort the whole run
    # (and skip the disconnect below): report it and move on.
    my $res2 = eval { $proce2->scrape(URI->new($link)) };
    if (!defined $res2) {
        warn "Skipping $link: $@";
        next;
    }

    # Pages without an "artibody" div yield no content; store an empty string
    # in that case rather than an undefined value.
    my $body = $res2->{content} ? $res2->{content}->[0] : undef;
    $body = '' if !defined $body;

    # Values are bound through the placeholders as-is. They must NOT be passed
    # through $dbh->quote first: the driver already quotes bound parameters,
    # so quoting twice stores literal quote characters and escapes in the row.
    # "$link" forces a plain string in case the scraper returned an object.
    my $tuples = $sth->execute($titles[$i], $body, "$link");
    if ($tuples) {
        print "Successfully inserted $tuples records\n";
    }
}

$dbh->disconnect;

Post a Comment