Update content #944
base: master
Changes from all commits
@@ -32,6 +32,7 @@
 import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
 import org.apache.http.impl.cookie.BasicClientCookie;
 import org.apache.http.protocol.HttpContext;
+import org.apache.http.ssl.SSLContexts;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -142,7 +143,15 @@ public void process(
         connectionManager.setDefaultSocketConfig(socketConfig);
         httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
         generateCookie(httpClientBuilder, site);
-        return httpClientBuilder.build();
+        SSLContext ctx = null;
+        try {
+            ctx = SSLContexts.custom().useProtocol("TLSv1.2").build();
+        } catch (NoSuchAlgorithmException e) {
+            logger.warn("CloseableHttpClient getClient #NoSuchAlgorithmException", e);
+        } catch (KeyManagementException e) {
+            logger.warn("CloseableHttpClient getClient #KeyManagementException", e);
+        }
+        return httpClientBuilder.setSSLContext(ctx).build();
     }

     private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) {

Review comment (on the useProtocol("TLSv1.2") line): Why force TLSv1.2 here? Isn't an SSLConnectionSocketFactory already registered in the constructor?
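For context on the reviewer's question: in HttpClient 4.4+, a custom SSLContext is typically wired in once, on the socket-factory registry that the connection pool is built from, rather than set again on the builder. A minimal sketch under that assumption (an illustration, not the actual WebMagic constructor):

    import javax.net.ssl.SSLContext;
    import java.security.KeyManagementException;
    import java.security.NoSuchAlgorithmException;
    import org.apache.http.config.Registry;
    import org.apache.http.config.RegistryBuilder;
    import org.apache.http.conn.socket.ConnectionSocketFactory;
    import org.apache.http.conn.socket.PlainConnectionSocketFactory;
    import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
    import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
    import org.apache.http.ssl.SSLContexts;

    // Pin TLSv1.2 on the socket factory the pool is built from, so the
    // builder needs no second setSSLContext call.
    static PoolingHttpClientConnectionManager tls12ConnectionManager()
            throws NoSuchAlgorithmException, KeyManagementException {
        SSLContext sslContext = SSLContexts.custom().useProtocol("TLSv1.2").build();
        SSLConnectionSocketFactory sslFactory = new SSLConnectionSocketFactory(sslContext);
        Registry<ConnectionSocketFactory> registry = RegistryBuilder.<ConnectionSocketFactory>create()
                .register("http", PlainConnectionSocketFactory.INSTANCE)
                .register("https", sslFactory)
                .build();
        return new PoolingHttpClientConnectionManager(registry);
    }

If the constructor already does this, setting a second SSLContext on the builder is redundant, which is presumably the point of the review question.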
@@ -1,5 +1,6 @@
 package us.codecraft.webmagic.pipeline;

+import net.jcip.annotations.ThreadSafe;
 import org.apache.commons.codec.digest.DigestUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -19,7 +20,9 @@
  * @author [email protected] <br>
  * @since 0.1.0
  */
-public class FilePipeline extends FilePersistentBase implements Pipeline {
+@ThreadSafe
+public class FilePipeline extends FilePersistentBase implements Pipeline {

     private Logger logger = LoggerFactory.getLogger(getClass());

Review comment (on the @ThreadSafe line): Where will this annotation actually be used?

Review comment (on the class declaration): The formatting is incorrect.
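On the first question: assuming the import resolves to the standard JCIP artifact (net.jcip.annotations), the annotation carries no runtime behavior in the crawler itself. It documents a thread-safety claim and can be picked up by static-analysis tools such as FindBugs. For reference, its declaration in the JCIP library is essentially:

    import java.lang.annotation.Documented;
    import java.lang.annotation.ElementType;
    import java.lang.annotation.Retention;
    import java.lang.annotation.RetentionPolicy;
    import java.lang.annotation.Target;

    // Documentary only: retained at runtime, but nothing in this PR reads it.
    @Documented
    @Target(ElementType.TYPE)
    @Retention(RetentionPolicy.RUNTIME)
    public @interface ThreadSafe {
    }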
@@ -0,0 +1,83 @@ (new file)
package us.codecraft.webmagic.processor.example;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Created by wpx on 2018/7/17.
 */
public class DownImgUtil {

    /**
     * Download a file from a network URL.
     * @param urlStr   the file URL
     * @param fileName the name to save the file as
     * @param savePath the directory to save into
     * @throws IOException
     */
    public static void downLoadFromUrl(String urlStr, String fileName, String savePath) throws IOException {
        // print TLS/SSL handshake debug information
        System.setProperty("javax.net.debug", "all");

        URL url = new URL(urlStr);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("GET");
        // set the connect timeout to 3 seconds
        conn.setConnectTimeout(3 * 1000);
        // set a browser User-Agent so the server does not reject the crawler with a 403
        conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");

        // get the input stream
        InputStream inputStream = conn.getInputStream();
        // read the response into a byte array
        byte[] getData = readInputStream(inputStream);

        // file save location
        File saveDir = new File(savePath);
        if (!saveDir.exists()) {
            saveDir.mkdirs();
        }
        File file = new File(saveDir + File.separator + fileName);
        FileOutputStream fos = new FileOutputStream(file);
        fos.write(getData);
        fos.close();
        inputStream.close();

        System.out.println("info:" + url + " download success");
    }

    /**
     * Read a byte array from an input stream.
     * @param inputStream
     * @return
     * @throws IOException
     */
    public static byte[] readInputStream(InputStream inputStream) throws IOException {
        byte[] buffer = new byte[1024];
        int len = 0;
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        while ((len = inputStream.read(buffer)) != -1) {
            bos.write(buffer, 0, len);
        }
        bos.close();
        return bos.toByteArray();
    }

    public static void main(String[] args) {
        try {
            downLoadFromUrl("http://img1.mm131.me/pic/4170/15.jpg", "百度.jpg", "D:\\webimg");
        } catch (Exception e) {
            // TODO: handle exception
            e.printStackTrace();
        }
    }
}
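The close() calls above are skipped if write() throws, leaking both streams. A tighter variant of the same download logic using try-with-resources (an illustration, not part of the PR; it assumes the same imports as the file above):

    public static void downLoadFromUrl(String urlStr, String fileName, String savePath) throws IOException {
        HttpURLConnection conn = (HttpURLConnection) new URL(urlStr).openConnection();
        conn.setRequestMethod("GET");
        conn.setConnectTimeout(3 * 1000);
        conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");

        File saveDir = new File(savePath);
        if (!saveDir.exists() && !saveDir.mkdirs()) {
            throw new IOException("could not create directory " + saveDir);
        }
        // try-with-resources closes both streams even if an exception is thrown mid-copy
        try (InputStream in = conn.getInputStream();
             FileOutputStream fos = new FileOutputStream(new File(saveDir, fileName))) {
            byte[] buffer = new byte[1024];
            int len;
            while ((len = in.read(buffer)) != -1) {
                fos.write(buffer, 0, len);
            }
        }
    }

This also streams the body straight to disk instead of buffering the whole image in memory first.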
@@ -6,24 +6,35 @@
 import us.codecraft.webmagic.processor.PageProcessor;

 /**
+ * This section uses the GithubRepoPageProcessor example to show how a PageProcessor is written.
+ * Customizing a PageProcessor breaks down into three parts: crawler configuration, page element extraction, and link discovery.
+ *
  * @author [email protected] <br>
  * @since 0.3.2
  */
 public class GithubRepoPageProcessor implements PageProcessor {
+    // Part 1: site crawl configuration, including encoding, crawl interval, retry count, etc.

     private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);

+    // process is the core interface for custom crawler logic; extraction logic is written here
     @Override
     public void process(Page page) {
-        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
-        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all());
+        // Part 2: define how to extract page information and save it
         page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
         page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
         if (page.getResultItems().get("name") == null) {
             // skip this page
             page.setSkip(true);
         }
         page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
+        // Part 3: discover follow-up URLs on the page to crawl
+        /*
+         * page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all() collects every link matching
+         * the regex "(https://github\.com/\w+/\w+)", and page.addTargetRequests() adds those links to the crawl queue.
+         */
+        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
+        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all());
     }

     @Override

@@ -32,6 +43,9 @@ public Site getSite() {
     }

     public static void main(String[] args) {
-        Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
+        // https://blog.csdn.net/bbc2005/article/details/80890829
+        // https://www.cnblogs.com/sunny08/p/8038440.html
+        // System.setProperty("javax.net.debug", "all"); // print TLS handshake debug info
+        Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(1).run();
     }
 }
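A note on the second regex in process(): the pattern "(https://github\\.com/[\\w\\-])" captures only a single character after the host, so every matched link is truncated. A standalone check (illustration only, not part of the PR):

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    // Compare what the wide and narrow patterns actually capture.
    public class RegexCheck {
        public static void main(String[] args) {
            String link = "https://github.com/code4craft/webmagic";
            Matcher wide = Pattern.compile("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").matcher(link);
            Matcher narrow = Pattern.compile("(https://github\\.com/[\\w\\-])").matcher(link);
            if (wide.find()) {
                System.out.println(wide.group(1));   // https://github.com/code4craft/webmagic
            }
            if (narrow.find()) {
                System.out.println(narrow.group(1)); // https://github.com/c
            }
        }
    }

Since WebMagic's regex selector returns capture group 1, the narrow pattern would enqueue truncated URLs such as "https://github.com/c".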
@@ -0,0 +1,77 @@ (new file)
package us.codecraft.webmagic.processor.example;

import org.apache.commons.collections.CollectionUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

/**
 * This class follows the GithubRepoPageProcessor example of how to write a PageProcessor.
 * Customizing a PageProcessor breaks down into three parts: crawler configuration, page element extraction, and link discovery.
 *
 * @author [email protected] <br>
 * @since 0.3.2
 */
public class ImgPageProcessor implements PageProcessor {
    // Part 1: site crawl configuration, including encoding, crawl interval, retry count, etc.

    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);

    // process is the core interface for custom crawler logic; extraction logic is written here
    @Override
    public void process(Page page) {
        // Part 2: define how to extract page information and save it
        String imgName = page.getHtml().xpath("//div[@class='content-pic']/a/img//@alt").toString();
        String imgUrl = page.getHtml().xpath("//div[@class='content-pic']/a/img//@src").toString();
        String urlpre = "";
        URL url = null;
        try {
            url = new URL(page.getUrl().toString());
            urlpre = url.getProtocol() + "://" + url.getHost() + "/" + url.getPath().split("/")[1];
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
        if (imgUrl != null && !imgUrl.isEmpty() && imgName != null && !imgName.isEmpty()) {
            try {
                DownImgUtil.downLoadFromUrl(imgUrl, imgName, "");
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        // Part 3: discover follow-up URLs on the page to crawl
        /*
         * page.getHtml().links().xpath(...).all() collects the candidate links,
         * and page.addTargetRequests() adds them to the crawl queue.
         */
        List<String> urls = page.getHtml().links().xpath("/html/body/div[6]/div[3]//@href").all();
        List<String> handledUrls = new ArrayList<String>();
        if (CollectionUtils.isNotEmpty(urls)) {
            for (String temp : urls) {
                handledUrls.add(urlpre + temp);
            }
        }
        page.addTargetRequests(handledUrls);
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        // https://blog.csdn.net/bbc2005/article/details/80890829
        // https://www.cnblogs.com/sunny08/p/8038440.html
        // System.setProperty("javax.net.debug", "all"); // print TLS handshake debug info
        Spider.create(new ImgPageProcessor()).addUrl("http://www.mm131.com/xinggan/4170.html").thread(1).run();
    }
}
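For reference, what urlpre evaluates to for the seed URL used in main() (standalone check, not part of the PR):

    import java.net.URL;

    // Reproduce the urlpre construction for the seed URL.
    public class UrlPreCheck {
        public static void main(String[] args) throws Exception {
            URL url = new URL("http://www.mm131.com/xinggan/4170.html");
            // getPath() is "/xinggan/4170.html"; split("/") yields ["", "xinggan", "4170.html"]
            String urlpre = url.getProtocol() + "://" + url.getHost() + "/" + url.getPath().split("/")[1];
            System.out.println(urlpre); // http://www.mm131.com/xinggan
        }
    }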
Review comment: This change is unnecessary.