欢迎来到 IT实训基地-南通科迅教育
咨询电话:0513-81107100
Jsoup HttpClient 抓取网络上的图
2016/5/8
科迅教育
790
南通IT培训班报价
Jsoup HttpClient 抓取网络上的图片
分类: java 2011-07-27 13:20 643人阅读 评论(0) 收藏 举报
网络exceptionstringurlclassthread
[java] view plaincopyprint?
package com.th.spider.test; 


import java.io.BufferedOutputStream; 
import java.io.FileOutputStream; 

import org.apache.commons.logging.Log; 
import org.apache.commons.logging.LogFactory; 
import org.apache.http.HttpEntity; 
import org.apache.http.HttpResponse; 
import org.apache.http.HttpStatus; 
import org.apache.http.client.HttpClient; 
import org.apache.http.client.methods.HttpGet; 
import org.apache.http.impl.client.DefaultHttpClient; 
import org.apache.http.params.CoreConnectionPNames; 
import org.apache.http.util.EntityUtils; 
import org.jsoup.Connection; 
import org.jsoup.Jsoup; 
import org.jsoup.nodes.Document; 
import org.jsoup.nodes.Element; 
import org.jsoup.select.Elements; 

public class Exmaple3 { 

private static final Log log = LogFactory.getLog(Exmaple3.class); 

/** 
* 抓取图片存放目录 
*/ 
private static final String PIC_DIR = "/home/li/pic"; 

/** 
* 链接超时 
*/ 
private static final int TIME_OUT = 5000; 

static void go3(String url) throws Exception { 
Connection conn= Jsoup.connect(url); 
Document doc = conn.get(); 
Elements links = doc.select("div.piclist img[src]"); 
for(int i=0;i<links.size();i++){ 
Element element = links.get(i); 
final String imgUrl = element.attr("src"); 
log.info(imgUrl); 
Thread.sleep(500); 
new Thread(new Runnable() { 
public void run() { 
try { 
save(imgUrl); 
} catch (Exception e) { 
// TODO Auto-generated catch block 
e.printStackTrace(); 


}).start(); 



static void go2(String url) throws Exception { 
Connection conn= Jsoup.connect(url); 
Document doc = conn.get(); 
Elements links = doc.select("div.cc a[href]"); 
for(int i=0;i<links.size();i++){ 
Element element = links.get(i); 
final String dirUrl = "http://www.3lian.com"+element.attr("href"); 
log.info(dirUrl); 
Thread.sleep(500); 
new Thread(new Runnable() { 
public void run() { 
try { 
Connection conn= Jsoup.connect(dirUrl); 
Document doc = conn.get(); 
Elements images = doc.select("div.mb_jjnr img[src]"); 
for(int j=0;j<images.size();j++){ 
Element img = images.get(j); 
String imgUrl = img.attr("src"); 
log.info(imgUrl); 
save(imgUrl); 

} catch (Exception e) { 
e.printStackTrace(); 


}).start(); 



/** 
* 处理帖子URL 
* @param url 
* @throws Exception 
*/ 
static void go(String url) throws Exception { 
// JSOP创建链接 
Connection conn = Jsoup.connect(url); 
// 请求返回整个文档对象 
Document doc = conn.post(); 
// 选择所有class=zoom 的img标签对象 
Elements imgs = doc.select("img[class=zoom]"); 
// 循环每个img标签 
for (int i = 0; i < imgs.size(); i++) { 
Element img = imgs.get(i); 
// 取得图片的下载地址 
String picURL = doc.baseUri() + img.attr("file"); 
log.info(picURL); 
// 保存图片 
save(picURL); 



//<img src="static/image/common/none.gif" file="data/attachment/forum/201105/08/174412nz3jq4z90s33s2t0.jpg" width="770" class="zoom" onclick="zoom(this, this.src)" id="aimg_180565" onmouseover="showMenu({'ctrlid':this.id,'pos':'12'})" alt="img_src_29620.jpg" title="img_src_29620.jpg" /> 
//doc.select("img[class=zoom]") 
/** 
* 保存图片 
* @param url 
* @param i 
* @throws Exception 
*/ 
static void save(String url) throws Exception { 
String fileName = url.substring(url.lastIndexOf("/")); 
String filePath = PIC_DIR + "/" + fileName; 
BufferedOutputStream out = null; 
byte[] bit = getByte(url); 
if (bit.length > 0) { 
try { 
out = new BufferedOutputStream(new FileOutputStream(filePath)); 
out.write(bit); 
out.flush(); 
log.info("Create File success! [" + filePath + "]"); 
} finally { 
if (out != null) 
out.close(); 




/** 
* 获取图片字节流 
* @param uri 
* @return 
* @throws Exception 
*/ 
static byte[] getByte(String uri) throws Exception { 
HttpClient client = new DefaultHttpClient(); 
client.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, TIME_OUT); 
HttpGet get = new HttpGet(uri); 
get.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, TIME_OUT); 
try { 
HttpResponse resonse = client.execute(get); 
if (resonse.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { 
HttpEntity entity = resonse.getEntity(); 
if (entity != null) { 
return EntityUtils.toByteArray(entity); 


} catch (Exception e) { 
e.printStackTrace(); 
} finally { 
client.getConnectionManager().shutdown(); 

return new byte[0]; 


public static void main(String[] args) throws Exception { 
// 开始抓取图片 
go2("http://www.3lian.com/gif/more/03/0301.html"); 
//go3("http://www.ivsky.com/tupian/nvxing_gouwu_qingjing_v6969/"); 

}
77
关闭
先学习,后交费申请表
每期5位名额
在线咨询
免费电话
QQ联系
先学习,后交费
TOP
您好,您想咨询哪门课程呢?
关于我们
机构简介
官方资讯
地理位置
联系我们
0513-91107100
周一至周六     8:30-21:00
微信扫我送教程
手机端访问
南通科迅教育信息咨询有限公司     苏ICP备15009282号     联系地址:江苏省南通市人民中路23-6号新亚大厦三楼             法律顾问:江苏瑞慈律师事务所     Copyright 2008-