Pull request #944: 更新内容 (Content updates)

Status: Open. Wants to merge 4 commits into base: master.
5 changes: 5 additions & 0 deletions pom.xml
@@ -193,6 +193,11 @@
                <artifactId>jedis</artifactId>
                <version>2.9.3</version>
            </dependency>
+           <dependency>
+               <groupId>net.jcip</groupId>
+               <artifactId>jcip-annotations</artifactId>
+               <version>1.0</version>
+           </dependency>
        </dependencies>
    </dependencyManagement>
5 changes: 4 additions & 1 deletion webmagic-core/pom.xml
@@ -80,7 +80,10 @@
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
        </dependency>
-
+       <dependency>
+           <groupId>net.jcip</groupId>
+           <artifactId>jcip-annotations</artifactId>
+       </dependency>
    </dependencies>

</project>
@@ -52,7 +52,7 @@ public class Site {
     *
     * @return new site
     */
-    public static Site me() {
+    public static Site me() {

Collaborator: This change is unnecessary.

        return new Site();
    }
@@ -32,6 +32,7 @@
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.protocol.HttpContext;
+import org.apache.http.ssl.SSLContexts;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@@ -142,7 +143,15 @@ public void process(
        connectionManager.setDefaultSocketConfig(socketConfig);
        httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
        generateCookie(httpClientBuilder, site);
-        return httpClientBuilder.build();
+        SSLContext ctx = null;
+        try {
+            ctx = SSLContexts.custom().useProtocol("TLSv1.2").build();

Collaborator: Why force TLSv1.2 here? Doesn't the constructor already register an SSLConnectionSocketFactory?

+        } catch (NoSuchAlgorithmException e) {
+            logger.warn("CloseableHttpClient getClient #NoSuchAlgorithmException", e);
+        } catch (KeyManagementException e) {
+            logger.warn("CloseableHttpClient getClient #KeyManagementException", e);
+        }
+        return httpClientBuilder.setSSLContext(ctx).build();
    }

    private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) {
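Context for the reviewer's question above: in HttpClient 4.x, HttpClientBuilder#setSSLContext is ignored when a connection manager is also set on the builder (as this generator does), because pooled connections take their socket factories from the registry the manager was built with. Below is a minimal sketch of the registry-based alternative the reviewer alludes to, using stock HttpClient 4.5 APIs; the class name and wiring are illustrative, not webmagic's actual code.

import javax.net.ssl.SSLContext;

import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.ssl.SSLContexts;

public class Tls12RegistrySketch {

    // Builds a pooled connection manager whose HTTPS connections are pinned
    // to TLSv1.2 at the socket-factory level, where the pool actually looks
    // the factory up; a builder-level setSSLContext() would not reach here.
    public static PoolingHttpClientConnectionManager newConnectionManager() throws Exception {
        SSLContext ctx = SSLContexts.custom().useProtocol("TLSv1.2").build();
        Registry<ConnectionSocketFactory> registry = RegistryBuilder.<ConnectionSocketFactory>create()
                .register("http", PlainConnectionSocketFactory.INSTANCE)
                .register("https", new SSLConnectionSocketFactory(ctx))
                .build();
        return new PoolingHttpClientConnectionManager(registry);
    }
}

Passed to HttpClients.custom().setConnectionManager(...), this pins every pooled HTTPS connection to TLSv1.2 without touching the builder's SSL settings.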
@@ -1,5 +1,6 @@
package us.codecraft.webmagic.pipeline;

+import net.jcip.annotations.ThreadSafe;
import org.apache.commons.codec.digest.DigestUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@@ -19,7 +20,9 @@
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
-public class FilePipeline extends FilePersistentBase implements Pipeline {
+@ThreadSafe
+public class FilePipeline extends FilePersistentBase implements Pipeline {

Collaborator: Where would this annotation ever be used?

Collaborator: The formatting here is incorrect.

    private Logger logger = LoggerFactory.getLogger(getClass());
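For context on the reviewer's question: the JCIP annotations add no runtime behavior on their own. @ThreadSafe documents a thread-safety claim that human reviewers and static-analysis tools (for example FindBugs, which understands net.jcip) can hold the implementation to. A minimal, hypothetical illustration of how the annotation is applied:

import java.util.concurrent.atomic.AtomicLong;

import net.jcip.annotations.ThreadSafe;

// @ThreadSafe records the author's claim that the class is safe for
// concurrent use; it changes nothing at runtime.
@ThreadSafe
public class Counter {

    private final AtomicLong count = new AtomicLong();

    public long increment() {
        return count.incrementAndGet();
    }
}

Absent an analysis tool in the build, the annotation serves as documentation only.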
@@ -39,7 +39,7 @@ public static void main(String[] args) {
        //multidownload
        List<String> list = new ArrayList<String>();
        list.add(String.format(urlTemplate,"风力发电"));
-        list.add(String.format(urlTemplate,"太阳能"));
+        // list.add(String.format(urlTemplate,"太阳能"));
        list.add(String.format(urlTemplate,"地热发电"));
        list.add(String.format(urlTemplate,"地热发电"));
        List<ResultItems> resultItemses = spider.<ResultItems>getAll(list);
@@ -0,0 +1,83 @@
package us.codecraft.webmagic.processor.example;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Created by wpx on 2018/7/17.
 */
public class DownImgUtil {
    /**
     * Download a file from a network URL.
     * @param urlStr   the source URL
     * @param fileName the name to save the file under
     * @param savePath the directory to save into
     * @throws IOException
     */
    public static void downLoadFromUrl(String urlStr, String fileName, String savePath) throws IOException {
        System.setProperty("javax.net.debug", "all");

        URL url = new URL(urlStr);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("GET");
        // Set the connect timeout to 3 seconds
        conn.setConnectTimeout(3 * 1000);
        // Send a browser User-Agent so the server does not block the crawler with a 403
        conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");

        // Read the response body into a byte array, closing the stream even on failure
        InputStream inputStream = conn.getInputStream();
        byte[] getData;
        try {
            getData = readInputStream(inputStream);
        } finally {
            inputStream.close();
        }

        // Create the target directory if it does not exist (mkdirs also creates parents)
        File saveDir = new File(savePath);
        if (!saveDir.exists()) {
            saveDir.mkdirs();
        }
        File file = new File(saveDir, fileName);
        FileOutputStream fos = new FileOutputStream(file);
        try {
            fos.write(getData);
        } finally {
            fos.close();
        }

        System.out.println("info:" + url + " download success");
    }

    /**
     * Read an input stream fully into a byte array.
     * @param inputStream
     * @return the stream's contents
     * @throws IOException
     */
    public static byte[] readInputStream(InputStream inputStream) throws IOException {
        byte[] buffer = new byte[1024];
        int len;
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        while ((len = inputStream.read(buffer)) != -1) {
            bos.write(buffer, 0, len);
        }
        bos.close();
        return bos.toByteArray();
    }

    public static void main(String[] args) {
        try {
            downLoadFromUrl("http://img1.mm131.me/pic/4170/15.jpg", "百度.jpg", "D:\\webimg");
        } catch (Exception e) {
            // TODO: handle exception
            e.printStackTrace();
        }
    }
}
@@ -6,24 +6,35 @@
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * This section uses GithubRepoPageProcessor as a worked example of writing a PageProcessor.
 * Customizing a PageProcessor breaks down into three parts: crawler configuration,
 * page-element extraction, and link discovery.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.3.2
 */
public class GithubRepoPageProcessor implements PageProcessor {
    // Part one: crawl-site configuration, including encoding, crawl interval, retry count, etc.

    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);

    // process() is the core interface for custom crawler logic; extraction logic goes here
    @Override
    public void process(Page page) {
-        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
+        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all());
        // Part two: define how to extract page fields and save them
        page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
        page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
        if (page.getResultItems().get("name") == null) {
            // skip this page
            page.setSkip(true);
        }
        page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
        // Part three: discover follow-up URLs on the page to crawl
        /*
         * page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all() collects every link
         * matching the regex "(https://github\.com/\w+/\w+)"; page.addTargetRequests() adds them
         * to the crawl queue.
         */
-        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
+        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all());
    }

    @Override
@@ -32,6 +43,9 @@ public Site getSite() {
    }

    public static void main(String[] args) {
-        Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
+        //https://blog.csdn.net/bbc2005/article/details/80890829
+        //https://www.cnblogs.com/sunny08/p/8038440.html
+        // System.setProperty("javax.net.debug", "all"); // print TLS handshake debug info
+        Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(1).run();
    }
}
@@ -0,0 +1,77 @@
package us.codecraft.webmagic.processor.example;

import org.apache.commons.collections.CollectionUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

/**
 * An image-downloading PageProcessor, organized into the same three parts as
 * GithubRepoPageProcessor: crawler configuration, page-element extraction, and link discovery.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.3.2
 */
public class ImgPageProcessor implements PageProcessor {
    // Part one: crawl-site configuration, including encoding, crawl interval, retry count, etc.

    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);

    // process() is the core interface for custom crawler logic; extraction logic goes here
    @Override
    public void process(Page page) {
        // Part two: extract the image name and URL from the page
        String imgName = page.getHtml().xpath("//div[@class='content-pic']/a/img//@alt").toString();
        String imgUrl = page.getHtml().xpath("//div[@class='content-pic']/a/img//@src").toString();
        String urlpre = "";
        URL url = null;
        try {
            url = new URL(page.getUrl().toString());
            urlpre = url.getProtocol() + "://" + url.getHost() + "/" + url.getPath().split("/")[1];
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
        // Only download when both the image URL and name were extracted
        if (imgUrl != null && !imgUrl.isEmpty() && imgName != null && !imgName.isEmpty()) {
            try {
                DownImgUtil.downLoadFromUrl(imgUrl, imgName, "");
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        // Part three: discover follow-up URLs on the page to crawl
        /*
         * links().xpath(...) collects the candidate links; page.addTargetRequests()
         * adds them to the crawl queue.
         */
        List<String> urls = page.getHtml().links().xpath("/html/body/div[6]/div[3]//@href").all();
        List<String> handledUrls = new ArrayList<String>();
        if (CollectionUtils.isNotEmpty(urls)) {
            for (String temp : urls) {
                handledUrls.add(urlpre + temp);
            }
        }
        page.addTargetRequests(handledUrls);
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        //https://blog.csdn.net/bbc2005/article/details/80890829
        //https://www.cnblogs.com/sunny08/p/8038440.html
        // System.setProperty("javax.net.debug", "all"); // print TLS handshake debug info
        Spider.create(new ImgPageProcessor()).addUrl("http://www.mm131.com/xinggan/4170.html").thread(1).run();
    }
}
@@ -37,7 +37,7 @@ public static void main(String[] args) {
        //multidownload
        List<String> list = new ArrayList<String>();
        list.add(String.format(urlTemplate,"风力发电"));
-        list.add(String.format(urlTemplate,"太阳能"));
+        // list.add(String.format(urlTemplate,"太阳能"));
        list.add(String.format(urlTemplate,"地热发电"));
        list.add(String.format(urlTemplate,"地热发电"));
        List<BaiduBaike> resultItemses = ooSpider.<BaiduBaike>getAll(list);
@@ -31,7 +31,7 @@ public static class ModelDateStr {

    public static class ModelDate {

-        @Formatter(value = "yyyyMMdd", formatter = DateFormatter.class)
+        //@Formatter(value = "yyyyMMdd", formatter = DateFormatter.class)
        @ExtractBy(value = "//div[@class='date']/text()", notNull = true)
        private Date date;

@@ -53,23 +53,23 @@ public static class ModelStringList {

    public static class ModelIntList {

-        @Formatter(subClazz = Integer.class)
+        //@Formatter(subClazz = Integer.class)
        @ExtractBy("//li[@class='numbers']/text()")
        private List<Integer> numbers;

    }

    public static class ModelDateList {

-        @Formatter(subClazz = Date.class, value = "yyyyMMdd")
+        //@Formatter(subClazz = Date.class, value = "yyyyMMdd")
        @ExtractBy("//li[@class='dates']/text()")
        private List<Date> dates;

    }

    public static class ModelCustomList {

-        @Formatter(subClazz = Date.class, value = "yyyyMMdd",formatter = DateFormatter.class)
+        //@Formatter(subClazz = Date.class, value = "yyyyMMdd",formatter = DateFormatter.class)
        @ExtractBy("//li[@class='dates']/text()")
        private List<Date> dates;