Java 采集器基础【源码】
采用 Java 爬虫框架 WebMagic,灵活、简单。
测试采集地址是:http://www.jfinal.com/project
效果:http://img.woaidaima.com//upload/image/20170408/1491624047540011541.jpg
Java 代码(所用到的 jar 包在源码里面):
/**
 * Minimal WebMagic crawler example.
 *
 * <p>Downloads {@code http://www.jfinal.com/project}, extracts the title and link of every
 * project entry with XPath, and prints each pair to standard output.
 */
public class SpiderTest implements PageProcessor {

    /** Crawl settings: retry count of 3 and a sleep time of 1000 (milliseconds per WebMagic's Site API). */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    /**
     * Starts the spider on the seed URL using 5 threads.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Spider.create(new SpiderTest())
                .addUrl("http://www.jfinal.com/project")
                .thread(5)
                .run();
    }

    /**
     * Returns the crawl settings used by this processor.
     *
     * @return the configured {@link Site}
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Extracts the title and link of each project entry on the page and prints them.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        // Parsed HTML of the downloaded page.
        Html html = page.getHtml();

        // Select link targets and link texts of the same anchors with XPath.
        List<String> hrefs = html.xpath("//div[@class='jf-panel-item']/h3/a/@href").all();
        List<String> titles = html.xpath("//div[@class='jf-panel-item']/h3/a/text()").all();

        // The two queries run independently, so guard against differing result sizes
        // instead of indexing past the end of the shorter list.
        int count = Math.min(titles.size(), hrefs.size());
        for (int i = 0; i < count; i++) {
            System.out.println("标题:" + titles.get(i) + "\t\t\t链接:" + hrefs.get(i));
        }
    }
}
页:
[1]