Implementing a Simple Crawler with Jsoup

Jsoup Crawler

jsoup is a Java library for working with real-world HTML. It provides a very convenient API for fetching URLs and for extracting and manipulating data, using the best of HTML5 DOM methods and CSS selectors.
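
As a minimal sketch of that API (the URL and selector below are placeholders, not part of the crawler built later in this post), fetching a page and listing its links looks roughly like this:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupQuickStart {

    public static void main(String[] args) throws Exception {
        // Fetch and parse the page (the URL here is only a placeholder)
        Document doc = Jsoup.connect("https://example.com/").get();
        System.out.println(doc.title());

        // CSS selector: every <a> element that has an href attribute
        for (Element link : doc.select("a[href]")) {
            System.out.println(link.attr("abs:href") + "  " + link.text());
        }
    }
}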

  • pom dependencies
<!-- jsoup HTML parser library @ https://jsoup.org/ -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.14.3</version>
</dependency>

<!-- slf4j logging facade -->
<dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-api</artifactId>
    <version>1.7.30</version>
</dependency>

<!-- logback logging implementation -->
<dependency>
    <groupId>ch.qos.logback</groupId>
    <artifactId>logback-classic</artifactId>
    <version>1.2.3</version>
</dependency>

<!-- hutool utilities (StrUtil, IdUtil) -->
<dependency>
    <groupId>cn.hutool</groupId>
    <artifactId>hutool-all</artifactId>
    <version>5.8.0.M4</version>
</dependency>

<!-- Aliyun OSS SDK for uploading the crawled images -->
<dependency>
    <groupId>com.aliyun.oss</groupId>
    <artifactId>aliyun-sdk-oss</artifactId>
    <version>3.10.2</version>
</dependency>
  • JsoupDemo
import cn.hutool.core.util.IdUtil;
import cn.hutool.core.util.StrUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

public class JsoupDemo {

    public static final Logger log = LoggerFactory.getLogger(JsoupDemo.class);

    public static void main(String[] args) throws IOException {
        // Fetch the column list page and parse it into a DOM
        Document doc = Jsoup.connect("http://10.1.20.55/gt/").get();
        log.info(doc.title());

        Elements newsHeadlines = doc.select("#column-list ul li");
        for (Element headline : newsHeadlines) {
            String href = headline.children().attr("href");

            // Open the column (archive) page
            Document archivePage = Jsoup.connect("http://10.1.20.55" + href).get();
            log.info("|||---------------------------------------------------|||");
            String title = archivePage.title();
            log.info("<<< Course topic: {} >>>", title);

            // Create a local directory per column
            File f2 = new File("E:\\study\\极客时间\\" + title);
            if (!f2.exists()) {
                f2.mkdir();
            }

            // Walk every article in the column
            Elements postList = archivePage.select("#column-content-list ul li");
            for (int i = 0; i < postList.size(); i++) {
                String itemHref = postList.get(i).children().attr("href");
                if (StrUtil.isBlank(itemHref)) {
                    continue;
                }
                Document post = Jsoup.connect("http://10.1.20.55" + itemHref).get();
                log.info("=== Article title: {} ===", post.title());

                // The article body is a single image; grab its src
                Elements image = post.select("#divimage #img-content");
                String src = image.get(0).attr("src");
                log.info(src);

                // Upload the image to OSS and write a Markdown file that references it
                String url = AliyunOSSUtils.uploadFile(
                        IdUtil.getSnowflakeNextId() + "_" + StrUtil.subAfter(src, "/", true),
                        "http://10.1.20.55" + src);
                log.info(url);
                try (FileWriter writer = new FileWriter(
                        "E:\\study\\极客时间\\" + title + "\\" + i + "-" + post.title() + ".md")) {
                    writer.write("![image-20220505095253102](" + url + ")");
                }
            }
        }
    }
}
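
The demo above calls Jsoup.connect(...).get() with default settings. If the target server is slow or filters the default client, jsoup's Connection API also lets you set a user agent and timeout before fetching. A small sketch (the user-agent string and timeout value are arbitrary examples, not taken from the demo):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;

public class JsoupFetch {

    // Fetch a page with an explicit user agent and timeout (values are arbitrary examples)
    public static Document fetch(String url) throws IOException {
        return Jsoup.connect(url)
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
                .timeout(10_000)   // milliseconds
                .get();
    }
}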
  • AliyunOSSUtils
import com.aliyun.oss.ClientException;
import com.aliyun.oss.OSS;
import com.aliyun.oss.OSSClientBuilder;
import com.aliyun.oss.OSSException;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;

public class AliyunOSSUtils {

    private static final String ENDPOINT = "oss-cn-beijing.aliyuncs.com";
    private static final String ACCESSKEY_ID = "LTAI4FofEpjtgLaKRaRAZqe4";
    private static final String ACCESSKEY_SECRET = "a6u8B0OANXReNcGfyDdLFK97a7xiaK";
    private static final String BUCKET_NAME = "haopeng-jksj";

    /**
     * Downloads the resource at {@code path} and uploads it to OSS under {@code uniqueFileName}.
     * Returns the public URL of the uploaded object.
     */
    public static String uploadFile(String uniqueFileName, String path) throws IOException {
        URL url = new URL(path);
        URLConnection conn = url.openConnection();
        try (InputStream in = conn.getInputStream()) {
            uploadFile(uniqueFileName, in);
        }
        return "https://" + BUCKET_NAME + "." + ENDPOINT + "/" + uniqueFileName;
    }

    public static void uploadFile(String uniqueFileName, InputStream in) {
        OSS ossClient = new OSSClientBuilder().build("https://" + ENDPOINT, ACCESSKEY_ID, ACCESSKEY_SECRET);
        try {
            // To set a storage class or ACL at upload time, attach ObjectMetadata to a PutObjectRequest:
            // PutObjectRequest putObjectRequest = new PutObjectRequest(BUCKET_NAME, uniqueFileName, in);
            // ObjectMetadata metadata = new ObjectMetadata();
            // metadata.setHeader(OSSHeaders.OSS_STORAGE_CLASS, StorageClass.Standard.toString());
            // metadata.setObjectAcl(CannedAccessControlList.Private);
            // putObjectRequest.setMetadata(metadata);

            ossClient.putObject(BUCKET_NAME, uniqueFileName, in);

        } catch (OSSException oe) {
            System.out.println("Caught an OSSException, which means your request made it to OSS, "
                    + "but was rejected with an error response for some reason.");
            System.out.println("Error Message:" + oe.getErrorMessage());
            System.out.println("Error Code:" + oe.getErrorCode());
            System.out.println("Request ID:" + oe.getRequestId());
            System.out.println("Host ID:" + oe.getHostId());
        } catch (ClientException ce) {
            System.out.println("Caught a ClientException, which means the client encountered "
                    + "a serious internal problem while trying to communicate with OSS, "
                    + "such as not being able to access the network.");
            System.out.println("Error Message:" + ce.getMessage());
        } finally {
            if (ossClient != null) {
                ossClient.shutdown();
            }
        }
    }
}
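
For reference, a hypothetical call site outside the crawler loop (the object key and source URL below are made up for illustration):

import cn.hutool.core.util.IdUtil;

public class OssUploadExample {

    public static void main(String[] args) throws Exception {
        // Object key and source image URL are made-up examples
        String ossUrl = AliyunOSSUtils.uploadFile(
                IdUtil.getSnowflakeNextId() + "_demo.png",
                "http://10.1.20.55/static/demo.png");
        System.out.println(ossUrl);
    }
}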
  • logback.xml
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
    <!-- Centralized property; referenced elsewhere as ${name} -->
    <property name="pattern" value="%d{yyyy-MM-dd HH:mm:ss.SSS} [%-5level] %c %M %L [%thread] %m%n"/>
    <!--
        Log output pattern:
        %d{yyyy-MM-dd HH:mm:ss.SSS}  date/time
        %-5level                     level, left-aligned to 5 characters
        %c                           fully qualified class (logger) name
        %M                           method name
        %L                           line number
        %thread                      thread name
        %m (or %msg)                 log message
        %n                           line separator
    -->

    <!-- Console appender; each nested tag maps to a setter on the appender class, e.g. <target> calls setTarget -->
    <appender name="console" class="ch.qos.logback.core.ConsoleAppender">
        <!-- Output stream; defaults to System.out, can be switched to System.err (rendered in a different color) -->
        <target>System.out</target>
        <!-- Message format -->
        <encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
            <pattern>${pattern}</pattern>
        </encoder>
    </appender>

    <!-- Root logger -->
    <root level="ALL">
        <appender-ref ref="console"/>
    </root>

</configuration>