采集新浪一小块新闻的标题、URL、内容，并存入 MySQL 数据库。多谢一位兄弟的帮忙才得以完成。速度还不是很理想，下一步看看能不能用上多线程，一步步提高自己的 Perl 水平。
#!/usr/bin/perl
# Scrape one block of headlines from the Sina news front page, fetch every
# linked article, and store (title, content, url) rows in the MySQL table
# `blog`.
use strict;
use warnings;
use utf8;

use DBI;
use URI;
use Web::Scraper;

binmode(STDIN,  ':encoding(utf8)');
binmode(STDOUT, ':encoding(utf8)');
binmode(STDERR, ':encoding(utf8)');

# NOTE(review): credentials are hard-coded in the script; consider moving
# them to a config file or the environment.
my $user        = "root";
my $passwd      = "123456";
my $data_source = "dbi:mysql:tes20";

my $dbh = DBI->connect($data_source, $user, $passwd, { RaiseError => 1 })
    or die "Can't connect to $data_source: $DBI::errstr";
$dbh->do("set names utf8");

my $url = "http://news.sina.com.cn";

# Both rules use the same XPath so that news[] and links[] line up by index.
my $headline_xpath
    = '//ul[@class="c_l14_01"]/following-sibling::ul[1]/li[1]/a';

my $index_scraper = scraper {
    process $headline_xpath, "news[]"  => "TEXT";
    process $headline_xpath, "links[]" => '@href';
};

# The article scraper does not depend on the loop variable, so it is built
# once instead of on every iteration.
my $article_scraper = scraper {
    process '//div[@id="artibody"]', "content[]" => 'TEXT';
};

my $res = $index_scraper->scrape(URI->new($url));

my $sth = $dbh->prepare(
    "INSERT into blog (title,content,url) values (?, ?, ?)");

my @titles = @{ $res->{news}  || [] };
my @links  = @{ $res->{links} || [] };

ARTICLE:
for my $i (0 .. $#links) {
    # Stringify explicitly: the scraped href may be a URI object, and a
    # plain string is what should be bound into the statement.
    my $link = "$links[$i]";

    # A single unreachable article should not abort the whole run.
    my $article = eval { $article_scraper->scrape(URI->new($link)) };
    if (!$article) {
        warn "Skipping $link: $@";
        next ARTICLE;
    }

    my $title   = $titles[$i];
    my $content = $article->{content}->[0];    # undef if no article body

    # Bind the raw values. Placeholders already take care of quoting;
    # running the values through $dbh->quote first would store literal
    # quote characters and escapes in the table.
    my $tuples = $sth->execute($title, $content, $link);
    if ($tuples) {
        print "Successfully inserted $tuples records\n";
    }
}

$dbh->disconnect;