Implementing a Simple Crawler with Jsoup

Jsoup Crawler

jsoup is a Java library for working with real-world HTML. It provides a very convenient API for fetching URLs and for extracting and manipulating data, using the best of HTML5 DOM methods and CSS selectors.
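
As a minimal sketch of that API (the URL and selector below are placeholders, not part of the crawler built later in this post), fetching a page and listing its links looks roughly like this:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupQuickStart {

    public static void main(String[] args) throws Exception {
        // Fetch and parse the page (the URL here is only a placeholder)
        Document doc = Jsoup.connect("https://example.com/").get();
        System.out.println(doc.title());

        // CSS selector: every <a> element that has an href attribute
        for (Element link : doc.select("a[href]")) {
            System.out.println(link.attr("abs:href") + "  " + link.text());
        }
    }
}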

  • pom dependencies
<!-- jsoup HTML parser library @ https://jsoup.org/ -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.14.3</version>
</dependency>

<!-- slf4j logging facade -->
<dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-api</artifactId>
    <version>1.7.30</version>
</dependency>

<!-- logback logging implementation -->
<dependency>
    <groupId>ch.qos.logback</groupId>
    <artifactId>logback-classic</artifactId>
    <version>1.2.3</version>
</dependency>

<!-- hutool utilities (StrUtil, IdUtil) -->
<dependency>
    <groupId>cn.hutool</groupId>
    <artifactId>hutool-all</artifactId>
    <version>5.8.0.M4</version>
</dependency>

<!-- Aliyun OSS SDK for uploading the crawled images -->
<dependency>
    <groupId>com.aliyun.oss</groupId>
    <artifactId>aliyun-sdk-oss</artifactId>
    <version>3.10.2</version>
</dependency>
  • JsoupDemo
import cn.hutool.core.util.IdUtil;
import cn.hutool.core.util.StrUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

public class JsoupDemo {

    public static final Logger log = LoggerFactory.getLogger(JsoupDemo.class);

    public static void main(String[] args) throws IOException {
        // Fetch the column list page and parse it into a DOM
        Document doc = Jsoup.connect("http://10.1.20.55/gt/").get();
        log.info(doc.title());

        Elements newsHeadlines = doc.select("#column-list ul li");
        for (Element headline : newsHeadlines) {
            String href = headline.children().attr("href");

            // Open the column (archive) page
            Document archivePage = Jsoup.connect("http://10.1.20.55" + href).get();
            log.info("|||---------------------------------------------------|||");
            String title = archivePage.title();
            log.info("<<< Course topic: {} >>>", title);

            // Create a local directory per column
            File f2 = new File("E:\\study\\极客时间\\" + title);
            if (!f2.exists()) {
                f2.mkdir();
            }

            // Walk every article in the column
            Elements postList = archivePage.select("#column-content-list ul li");
            for (int i = 0; i < postList.size(); i++) {
                String itemHref = postList.get(i).children().attr("href");
                if (StrUtil.isBlank(itemHref)) {
                    continue;
                }
                Document post = Jsoup.connect("http://10.1.20.55" + itemHref).get();
                log.info("=== Article title: {} ===", post.title());

                // The article body is a single image; grab its src
                Elements image = post.select("#divimage #img-content");
                String src = image.get(0).attr("src");
                log.info(src);

                // Upload the image to OSS and write a Markdown file that references it
                String url = AliyunOSSUtils.uploadFile(
                        IdUtil.getSnowflakeNextId() + "_" + StrUtil.subAfter(src, "/", true),
                        "http://10.1.20.55" + src);
                log.info(url);
                try (FileWriter writer = new FileWriter(
                        "E:\\study\\极客时间\\" + title + "\\" + i + "-" + post.title() + ".md")) {
                    writer.write("![image-20220505095253102](" + url + ")");
                }
            }
        }
    }
}
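
The demo above calls Jsoup.connect(...).get() with default settings. If the target server is slow or filters the default client, jsoup's Connection API also lets you set a user agent and timeout before fetching. A small sketch (the user-agent string and timeout value are arbitrary examples, not taken from the demo):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;

public class JsoupFetch {

    // Fetch a page with an explicit user agent and timeout (values are arbitrary examples)
    public static Document fetch(String url) throws IOException {
        return Jsoup.connect(url)
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
                .timeout(10_000)   // milliseconds
                .get();
    }
}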
  • AliyunOSSUtils
import com.aliyun.oss.ClientException;
import com.aliyun.oss.OSS;
import com.aliyun.oss.OSSClientBuilder;
import com.aliyun.oss.OSSException;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;

public class AliyunOSSUtils {

    private static final String ENDPOINT = "oss-cn-beijing.aliyuncs.com";
    private static final String ACCESSKEY_ID = "LTAI4FofEpjtgLaKRaRAZqe4";
    private static final String ACCESSKEY_SECRET = "a6u8B0OANXReNcGfyDdLFK97a7xiaK";
    private static final String BUCKET_NAME = "haopeng-jksj";

    /**
     * Downloads the resource at {@code path} and uploads it to OSS under {@code uniqueFileName}.
     * Returns the public URL of the uploaded object.
     */
    public static String uploadFile(String uniqueFileName, String path) throws IOException {
        URL url = new URL(path);
        URLConnection conn = url.openConnection();
        try (InputStream in = conn.getInputStream()) {
            uploadFile(uniqueFileName, in);
        }
        return "https://" + BUCKET_NAME + "." + ENDPOINT + "/" + uniqueFileName;
    }

    public static void uploadFile(String uniqueFileName, InputStream in) {
        OSS ossClient = new OSSClientBuilder().build("https://" + ENDPOINT, ACCESSKEY_ID, ACCESSKEY_SECRET);
        try {
            // To set a storage class or ACL at upload time, attach ObjectMetadata to a PutObjectRequest:
            // PutObjectRequest putObjectRequest = new PutObjectRequest(BUCKET_NAME, uniqueFileName, in);
            // ObjectMetadata metadata = new ObjectMetadata();
            // metadata.setHeader(OSSHeaders.OSS_STORAGE_CLASS, StorageClass.Standard.toString());
            // metadata.setObjectAcl(CannedAccessControlList.Private);
            // putObjectRequest.setMetadata(metadata);

            ossClient.putObject(BUCKET_NAME, uniqueFileName, in);

        } catch (OSSException oe) {
            System.out.println("Caught an OSSException, which means your request made it to OSS, "
                    + "but was rejected with an error response for some reason.");
            System.out.println("Error Message:" + oe.getErrorMessage());
            System.out.println("Error Code:" + oe.getErrorCode());
            System.out.println("Request ID:" + oe.getRequestId());
            System.out.println("Host ID:" + oe.getHostId());
        } catch (ClientException ce) {
            System.out.println("Caught a ClientException, which means the client encountered "
                    + "a serious internal problem while trying to communicate with OSS, "
                    + "such as not being able to access the network.");
            System.out.println("Error Message:" + ce.getMessage());
        } finally {
            if (ossClient != null) {
                ossClient.shutdown();
            }
        }
    }
}
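
For reference, a hypothetical call site outside the crawler loop (the object key and source URL below are made up for illustration):

import cn.hutool.core.util.IdUtil;

public class OssUploadExample {

    public static void main(String[] args) throws Exception {
        // Object key and source image URL are made-up examples
        String ossUrl = AliyunOSSUtils.uploadFile(
                IdUtil.getSnowflakeNextId() + "_demo.png",
                "http://10.1.20.55/static/demo.png");
        System.out.println(ossUrl);
    }
}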
  • logback.xml
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
    <!-- Centralized property; referenced elsewhere as ${name} -->
    <property name="pattern" value="%d{yyyy-MM-dd HH:mm:ss.SSS} [%-5level] %c %M %L [%thread] %m%n"/>
    <!--
        Log output pattern:
        %d{yyyy-MM-dd HH:mm:ss.SSS}  date/time
        %-5level                     level, left-aligned to 5 characters
        %c                           fully qualified class (logger) name
        %M                           method name
        %L                           line number
        %thread                      thread name
        %m (or %msg)                 log message
        %n                           line separator
    -->

    <!-- Console appender; each nested tag maps to a setter on the appender class, e.g. <target> calls setTarget -->
    <appender name="console" class="ch.qos.logback.core.ConsoleAppender">
        <!-- Output stream; defaults to System.out, can be switched to System.err (rendered in a different color) -->
        <target>System.out</target>
        <!-- Message format -->
        <encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
            <pattern>${pattern}</pattern>
        </encoder>
    </appender>

    <!-- Root logger -->
    <root level="ALL">
        <appender-ref ref="console"/>
    </root>

</configuration>