From 7ef204135d2c441954997300f553b12767b57375 Mon Sep 17 00:00:00 2001 From: apaqi Date: Tue, 17 Jul 2018 01:25:08 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E7=BD=91=E7=BB=9C?= =?UTF-8?q?=E5=8D=8F=E8=AE=AE=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 解决网络协议问题 --- .../src/main/java/us/codecraft/webmagic/Site.java | 2 +- .../webmagic/downloader/HttpClientGenerator.java | 11 ++++++++++- .../us/codecraft/webmagic/pipeline/FilePipeline.java | 2 +- .../processor/example/BaiduBaikePageProcessor.java | 2 +- .../processor/example/GithubRepoPageProcessor.java | 4 +++- .../us/codecraft/webmagic/example/BaiduBaike.java | 2 +- .../webmagic/model/PageModelExtractorTest.java | 8 ++++---- .../codecraft/webmagic/model/ProcessorBenchmark.java | 5 ++++- 8 files changed, 25 insertions(+), 11 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index b6963ca43..766d08fc2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -52,7 +52,7 @@ public class Site { * * @return new site */ - public static Site me() { + public static Site me() { return new Site(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 562f36f6f..0125049b2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -14,6 +14,7 @@ import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.protocol.HttpContext; +import org.apache.http.ssl.SSLContexts; import org.slf4j.Logger; import 
org.slf4j.LoggerFactory; import us.codecraft.webmagic.Site; @@ -123,7 +124,15 @@ public void process( connectionManager.setDefaultSocketConfig(socketConfig); httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true)); generateCookie(httpClientBuilder, site); - return httpClientBuilder.build(); + SSLContext ctx = null; + try { + ctx = SSLContexts.custom().useProtocol("TLSv1.2").build(); + } catch (NoSuchAlgorithmException e) { + logger.warn("CloseableHttpClient getClient #NoSuchAlgorithmException,{}", e); + } catch (KeyManagementException e) { + logger.warn("CloseableHttpClient getClient #KeyManagementException,{}", e); + } + return httpClientBuilder.setSSLContext(ctx).build(); } private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 57d6eea3f..096e1c3a8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -22,7 +22,7 @@ * @since 0.1.0 */ @ThreadSafe -public class FilePipeline extends FilePersistentBase implements Pipeline { +public class FilePipeline extends FilePersistentBase implements Pipeline { private Logger logger = LoggerFactory.getLogger(getClass()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java index f6ad87e05..2949f8c8a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java @@ -39,7 +39,7 @@ public static void main(String[] args) { //multidownload List list = new
ArrayList(); list.add(String.format(urlTemplate,"风力发电")); - list.add(String.format(urlTemplate,"太阳能")); + // list.add(String.format(urlTemplate,"太阳能")); list.add(String.format(urlTemplate,"地热发电")); list.add(String.format(urlTemplate,"地热发电")); List resultItemses = spider.getAll(list); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java index e93ab4cd5..8ab06ca4d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java @@ -32,6 +32,8 @@ public Site getSite() { } public static void main(String[] args) { - Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run(); + //https://blog.csdn.net/bbc2005/article/details/80890829 + // System.setProperty("javax.net.debug", "all"); //打印网络连接握手信息 + Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(1).run(); } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java index 003c5730d..fa6b6a65b 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java @@ -37,7 +37,7 @@ public static void main(String[] args) { //multidownload List list = new ArrayList(); list.add(String.format(urlTemplate,"风力发电")); - list.add(String.format(urlTemplate,"太阳能")); + // list.add(String.format(urlTemplate,"太阳能")); list.add(String.format(urlTemplate,"地热发电")); list.add(String.format(urlTemplate,"地热发电")); List resultItemses = ooSpider.getAll(list); diff --git 
a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageModelExtractorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageModelExtractorTest.java index f212628b4..83a91b9a6 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageModelExtractorTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/PageModelExtractorTest.java @@ -31,7 +31,7 @@ public static class ModelDateStr { public static class ModelDate { - @Formatter(value = "yyyyMMdd", formatter = DateFormatter.class) + //@Formatter(value = "yyyyMMdd", formatter = DateFormatter.class) @ExtractBy(value = "//div[@class='date']/text()", notNull = true) private Date date; @@ -53,7 +53,7 @@ public static class ModelStringList { public static class ModelIntList { - @Formatter(subClazz = Integer.class) + //@Formatter(subClazz = Integer.class) @ExtractBy("//li[@class='numbers']/text()") private List numbers; @@ -61,7 +61,7 @@ public static class ModelIntList { public static class ModelDateList { - @Formatter(subClazz = Date.class, value = "yyyyMMdd") + ///@Formatter(subClazz = Date.class, value = "yyyyMMdd") @ExtractBy("//li[@class='dates']/text()") private List dates; @@ -69,7 +69,7 @@ public static class ModelDateList { public static class ModelCustomList { - @Formatter(subClazz = Date.class, value = "yyyyMMdd",formatter = DateFormatter.class) + //@Formatter(subClazz = Date.class, value = "yyyyMMdd",formatter = DateFormatter.class) @ExtractBy("//li[@class='dates']/text()") private List dates; diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java index 7c6192692..68a6fedc6 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/model/ProcessorBenchmark.java @@ -21,7 +21,8 @@ public void test() { Page page = new 
Page(); page.setRequest(new Request("http://my.oschina.net/flashsword/blog")); page.setUrl(new PlainText("http://my.oschina.net/flashsword/blog")); - page.setHtml(new Html(html)); + //page.setHtml(new Html(html)); + page.setHtml(new Html("")); long time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { modelPageProcessor.process(page); @@ -34,6 +35,7 @@ public void test() { System.out.println(System.currentTimeMillis() - time); } +/* private String html = "\n" + "\n" + "\n" + @@ -888,4 +890,5 @@ public void test() { "\n" + "\n" + ""; +*/ } From 78a16218315a68204e26342333813df48c44b4cf Mon Sep 17 00:00:00 2001 From: apaqi Date: Thu, 19 Jul 2018 01:00:59 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=E5=9B=BE=E7=89=87=E4=B8=8B=E8=BD=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 图片下载 --- .../processor/example/DownImgUtil.java | 83 +++++++++++++++++++ .../example/GithubRepoPageProcessor.java | 16 +++- .../processor/example/ImgPageProcessor.java | 77 +++++++++++++++++ 3 files changed, 174 insertions(+), 2 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/DownImgUtil.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ImgPageProcessor.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/DownImgUtil.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/DownImgUtil.java new file mode 100644 index 000000000..d4a285064 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/DownImgUtil.java @@ -0,0 +1,83 @@ +package us.codecraft.webmagic.processor.example; + + +import java.io.*; +import java.net.HttpURLConnection; +import java.net.URL; + +/** + * Created by wpx on 2018/7/17. 
+ */ +public class DownImgUtil { + /** + * 从网络Url中下载文件 + * @param urlStr + * @param fileName + * @param savePath + * @throws IOException + */ + public static void downLoadFromUrl(String urlStr,String fileName,String savePath) throws IOException{ + System.setProperty("javax.net.debug", "all"); + + URL url = new URL(urlStr); + HttpURLConnection conn = (HttpURLConnection)url.openConnection(); + conn.setRequestMethod("GET"); + //设置超时间为3秒 + conn.setConnectTimeout(3*1000); + //防止屏蔽程序抓取而返回403错误 + conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)"); + + //得到输入流 + InputStream inputStream = conn.getInputStream(); + //获取自己数组 + byte[] getData = readInputStream(inputStream); + + //文件保存位置 + File saveDir = new File(savePath); + if(!saveDir.exists()){ + saveDir.mkdir(); + } + File file = new File(saveDir+File.separator+fileName); + FileOutputStream fos = new FileOutputStream(file); + fos.write(getData); + if(fos!=null){ + fos.close(); + } + if(inputStream!=null){ + inputStream.close(); + } + + + System.out.println("info:"+url+" download success"); + + } + + + + /** + * 从输入流中获取字节数组 + * @param inputStream + * @return + * @throws IOException + */ + public static byte[] readInputStream(InputStream inputStream) throws IOException { + byte[] buffer = new byte[1024]; + int len = 0; + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + while((len = inputStream.read(buffer)) != -1) { + bos.write(buffer, 0, len); + } + bos.close(); + return bos.toByteArray(); + } + + public static void main(String[] args) { + try{ + downLoadFromUrl("http://img1.mm131.me/pic/4170/15.jpg", "百度.jpg","D:\\webimg"); + }catch (Exception e) { + // TODO: handle exception + System.out.println(""); + } + } + +} \ No newline at end of file diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java index 
8ab06ca4d..5b65b7b08 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java @@ -6,17 +6,21 @@ import us.codecraft.webmagic.processor.PageProcessor; /** + * 这部分我们直接通过GithubRepoPageProcessor这个例子来介绍PageProcessor的编写方式。 + * 我将PageProcessor的定制分为三个部分,分别是爬虫的配置、页面元素的抽取和链接的发现。 + * * @author code4crafter@gmail.com
* @since 0.3.2 */ public class GithubRepoPageProcessor implements PageProcessor { + // 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等 private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000); + // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 @Override public void process(Page page) { - page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all()); - page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all()); + // 部分二:定义如何抽取页面信息,并保存下来 page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString()); if (page.getResultItems().get("name")==null){ @@ -24,6 +28,13 @@ public void process(Page page) { page.setSkip(true); } page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); + // 部分三:从页面发现后续的url地址来抓取 + /** + * page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()用于获取所有满足"(https:/ /github\.com/\w+/\w+)" + * 这个正则表达式的链接,page.addTargetRequests()则将这些链接加入到待抓取的队列中去。 + */ + page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all()); + page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all()); } @Override @@ -33,6 +44,7 @@ public Site getSite() { public static void main(String[] args) { //https://blog.csdn.net/bbc2005/article/details/80890829 + //https://www.cnblogs.com/sunny08/p/8038440.html // System.setProperty("javax.net.debug", "all"); //打印网络连接握手信息 Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(1).run(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ImgPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ImgPageProcessor.java new file mode 100644 index 000000000..04d740eff --- /dev/null +++ 
b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ImgPageProcessor.java @@ -0,0 +1,77 @@ +package us.codecraft.webmagic.processor.example; + +import org.apache.commons.collections.CollectionUtils; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; + +/** + * 这部分我们直接通过GithubRepoPageProcessor这个例子来介绍PageProcessor的编写方式。 + * 我将PageProcessor的定制分为三个部分,分别是爬虫的配置、页面元素的抽取和链接的发现。 + * + * @author code4crafter@gmail.com
+ * @since 0.3.2 + */ +public class ImgPageProcessor implements PageProcessor { + // 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等 + + private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000); + + // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 + @Override + public void process(Page page) { + // 部分二:定义如何抽取页面信息,并保存下来 + String imgName = page.getHtml().xpath("//div[@class='content-pic']/a/img//@alt").toString(); + String imgUrl = page.getHtml().xpath("//div[@class='content-pic']/a/img//@src").toString(); + String urlpre = ""; + URL url = null; + try { + url = new URL(page.getUrl().toString()); + System.out.println(""); + urlpre = url.getProtocol()+"://" + url.getHost() +"/"+ url.getPath().split("/")[1]; + } catch (MalformedURLException e) { + e.printStackTrace(); + } + if(null!=imgUrl && ""!= imgUrl && ""!=imgName){ + try { + DownImgUtil.downLoadFromUrl(imgUrl, imgName,""); + } catch (IOException e) { + e.printStackTrace(); + } + } + + + // 部分三:从页面发现后续的url地址来抓取 + /** + * page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()用于获取所有满足"(https:/ /github\.com/\w+/\w+)" + * 这个正则表达式的链接,page.addTargetRequests()则将这些链接加入到待抓取的队列中去。 + */ + List urls = page.getHtml().links().xpath("/html/body/div[6]/div[3]//@href").all(); + List handledUrls = new ArrayList(); + if(CollectionUtils.isNotEmpty(urls)) { + for(String temp : urls) { + handledUrls.add(urlpre+temp); + } + } + page.addTargetRequests(handledUrls); + } + + @Override + public Site getSite() { + return site; + } + + public static void main(String[] args) { + //https://blog.csdn.net/bbc2005/article/details/80890829 + //https://www.cnblogs.com/sunny08/p/8038440.html + // System.setProperty("javax.net.debug", "all"); //打印网络连接握手信息 + Spider.create(new ImgPageProcessor()).addUrl("http://www.mm131.com/xinggan/4170.html").thread(1).run(); + } +} From e4ff496615b7991ae9a268ed4f0f4b4716acf4eb Mon Sep 17 00:00:00 2001 From: apaqi Date: Sun, 23 Aug 2020 13:18:12 +0800 Subject: [PATCH 3/3] 
=?UTF-8?q?=E6=B7=BB=E5=8A=A0=E7=99=BE=E5=BA=A6?= =?UTF-8?q?=E5=9B=BE=E7=89=87=E6=90=9C=E7=B4=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加百度图片搜索 --- pom.xml | 5 + webmagic-core/pom.xml | 5 +- .../webmagic/pipeline/FilePipeline.java | 1 + .../BaiduPictureDownloadProcesser.java | 193 ++++++++++++++++++ 4 files changed, 203 insertions(+), 1 deletion(-) create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/samples/BaiduPictureDownloadProcesser.java diff --git a/pom.xml b/pom.xml index d016d0a92..115e4d952 100644 --- a/pom.xml +++ b/pom.xml @@ -193,6 +193,11 @@ jedis 2.9.3 + + net.jcip + jcip-annotations + 1.0 + diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 44fb7fa4d..a5c30982c 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -80,7 +80,10 @@ com.alibaba fastjson - + + net.jcip + jcip-annotations + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 964e5fd5d..409c5775a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.pipeline; +import net.jcip.annotations.ThreadSafe; import org.apache.commons.codec.digest.DigestUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/BaiduPictureDownloadProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/BaiduPictureDownloadProcesser.java new file mode 100644 index 000000000..a9cdb1e27 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/BaiduPictureDownloadProcesser.java @@ -0,0 +1,193 @@ +package us.codecraft.webmagic.samples; + +import com.alibaba.fastjson.JSONArray; +import com.alibaba.fastjson.JSONObject; 
+import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.io.DataInputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.Exchanger; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +public class BaiduPictureDownloadProcesser implements PageProcessor { + ExecutorService executorService = Executors.newFixedThreadPool(10); + + private Site site = Site.me().setCycleRetryTimes(5).setRetryTimes(5).setSleepTime(500).setTimeOut(3 * 60 * 1000) + .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0") + .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + .addHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3") + .setCharset("UTF-8"); + private final static Map KEY_WORDS = new HashMap<>(); + + static { + //奶制品 + KEY_WORDS.put("牛奶", "牛奶"); + KEY_WORDS.put("奶酪", "奶酪"); + KEY_WORDS.put("酸奶", "酸奶"); + //哺乳动物肉 + KEY_WORDS.put("羊肉", "肉"); + KEY_WORDS.put("牛肉", "肉"); + KEY_WORDS.put("狗肉", "肉"); + KEY_WORDS.put("驴肉", "肉"); + KEY_WORDS.put("猪肉", "肉"); + //家禽肉 + KEY_WORDS.put("鸡", "家禽肉"); + KEY_WORDS.put("鸭", "家禽肉"); + KEY_WORDS.put("鹅", "家禽肉"); + //蛋类 + KEY_WORDS.put("鸡蛋", "蛋"); + KEY_WORDS.put("鸭蛋", "蛋"); + KEY_WORDS.put("鸽子蛋", "蛋"); + //蔬菜 + KEY_WORDS.put("冬瓜", "冬瓜"); + KEY_WORDS.put("西红柿", "西红柿"); + KEY_WORDS.put("苦瓜", "苦瓜"); + KEY_WORDS.put("青椒", "青椒"); + KEY_WORDS.put("胡萝卜", "胡萝卜"); + KEY_WORDS.put("南瓜", "南瓜"); + KEY_WORDS.put("玉米", "玉米"); + KEY_WORDS.put("秋葵", "秋葵"); + 
KEY_WORDS.put("西兰花", "西兰花"); + KEY_WORDS.put("生姜", "生姜"); + //水果 + KEY_WORDS.put("苹果", "苹果"); + KEY_WORDS.put("梨", "梨"); + KEY_WORDS.put("香蕉", "香蕉"); + KEY_WORDS.put("葡萄", "葡萄"); + KEY_WORDS.put("榴莲", "榴莲"); + KEY_WORDS.put("猕猴桃", "猕猴桃"); + KEY_WORDS.put("哈密瓜", "哈密瓜"); + KEY_WORDS.put("草莓", "草莓"); + KEY_WORDS.put("橘子", "橘子"); + KEY_WORDS.put("菠萝", "菠萝"); + KEY_WORDS.put("山楂", "山楂"); + KEY_WORDS.put("桂圆", "桂圆"); + //水产品 + KEY_WORDS.put("虾", "虾"); + KEY_WORDS.put("蟹", "蟹"); + KEY_WORDS.put("鱼", "鱼"); + KEY_WORDS.put("贝类", "贝类"); + KEY_WORDS.put("螺类", "螺类"); + KEY_WORDS.put("海参类", "海参类"); + // + KEY_WORDS.put("豆皮", "豆皮"); + KEY_WORDS.put("豆腐脑", "豆腐脑"); + KEY_WORDS.put("豆干", "豆干"); + KEY_WORDS.put("豆腐", "豆腐"); + //坚果 + KEY_WORDS.put("腰果", "腰果"); + KEY_WORDS.put("开心果", "开心果"); + KEY_WORDS.put("核桃", "核桃"); + KEY_WORDS.put("葡萄干", "葡萄干"); + KEY_WORDS.put("夏威夷果", "夏威夷果"); + } + + @Override + public void process(Page page) { + List url_list = new ArrayList<>(); + List name_list = new ArrayList<>(); + JSONObject jsonObject = (JSONObject) JSONObject.parse(page.getRawText()); + JSONArray data = (JSONArray) jsonObject.get("data"); + for (int i = 0; i < data.size(); i++) { + String url = (String) data.getJSONObject(i).get("thumbURL"); + String name = (String) data.getJSONObject(i).get("fromPageTitleEnc"); + if (url != null) { + url_list.add(url); + name_list.add(name); + } + } + setUrls(url_list); + setNames(name_list); + + } + + @Override + public Site getSite() { + return this.site; + } + + private void downloadPicture(List urlList, String key, String keyName) { + URL url = null; + for (int i = 0; i < urlList.size(); i++) { + try { + url = new URL(urlList.get(i)); + DataInputStream dataInputStream = new DataInputStream(url.openStream()); + String imageName = i + ".jpg"; + createDir("d:\\pic\\" + keyName); + File file = new File("d:\\pic\\" + keyName); //设置下载路径 + if (!file.isDirectory()) { + file.mkdirs(); + } + FileOutputStream fileOutputStream = new FileOutputStream(new 
File("d:\\pic\\" + keyName + "\\" + imageName.trim())); + byte[] buffer = new byte[1024]; + int length; + while ((length = dataInputStream.read(buffer)) > 0) { + fileOutputStream.write(buffer, 0, length); + } + dataInputStream.close(); + fileOutputStream.close(); + } catch (MalformedURLException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + + public static void main(String[] args) { + + BaiduPictureDownloadProcesser downloadPicture = new BaiduPictureDownloadProcesser(); + for (Map.Entry entry : KEY_WORDS.entrySet()) { + List urlList = new CopyOnWriteArrayList(); + for (int i = 0; i < 2; i++) { //控制爬取页数,一页10张图片 + String url = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&queryWord=" + entry.getKey() + "&word=" + entry.getKey() + "&pn=" + i * 10 + "0"; + Spider.create(new BaiduPictureDownloadProcesser()) + .addUrl(url) + .run(); + urlList.addAll(urls); + } + downloadPicture.downloadPicture(urlList, entry.getKey(), entry.getValue()); + } + + } + + static List urls; + static List names; + + public void setUrls(List urls) { + this.urls = urls; + } + + public void setNames(List names) { + this.names = names; + } + + /** + * 创建文件夹 + * + * @param dir 当前文件夹 + */ + private void createDir(String dir) throws IOException { + Path path = Paths.get(dir); + if (!Files.exists(path)) { + Files.createDirectories(path); + } + + } +}