Jsoup爬虫
jsoup 是一个用于处理真实世界 HTML 的 Java 库。它使用最好的 HTML5 DOM 方法和 CSS 选择器提供了一个非常方便的 API,用于获取 URL 以及提取和操作数据。
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.14.3</version>
</dependency>
<dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-api</artifactId>
    <version>1.7.30</version>
</dependency>
<dependency>
    <groupId>ch.qos.logback</groupId>
    <artifactId>logback-classic</artifactId>
    <version>1.2.3</version>
</dependency>
<dependency>
    <groupId>cn.hutool</groupId>
    <artifactId>hutool-all</artifactId>
    <version>5.8.0.M4</version>
</dependency>
<dependency>
    <groupId>com.aliyun.oss</groupId>
    <artifactId>aliyun-sdk-oss</artifactId>
    <version>3.10.2</version>
</dependency>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 public class JsoupDemo { public static final Logger log = LoggerFactory.getLogger(JsoupDemo.class); public static void main (String[] args) throws IOException { Document doc = Jsoup.connect("http://10.1.20.55/gt/" ).get(); log.info(doc.title()); Elements newsHeadlines = doc.select("#column-list ul li" ); for (Element headline : newsHeadlines) { String href = headline.children().attr("href" ); Document archivePage = Jsoup.connect("http://10.1.20.55" + href).get(); log.info("|||---------------------------------------------------|||" ); String title = archivePage.title(); log.info("《《《 课程专题: {} 》》》" , title); File f2 = new File ("E:\\study\\极客时间\\" + title); if (!f2.exists()) { f2.mkdir(); } Elements postList = archivePage.select("#column-content-list ul li" ); for (int i = 0 ; i < postList.size(); i++) { String itemHref = postList.get(i).children().attr("href" ); if (StrUtil.isBlank(itemHref)) { continue ; } Document post = Jsoup.connect("http://10.1.20.55" + itemHref).get(); log.info("=== 文章标题: {} ===" , post.title()); Elements image = post.select("#divimage #img-content" ); String src = image.get(0 ).attr("src" ); log.info(src); String url = AliyunOSSUtils.uploadFile(IdUtil.getSnowflakeNextId() + "_" + StrUtil.subAfter(src, "/" , true ), "http://10.1.20.55" + src); System.out.println(url); FileWriter writer = new FileWriter ("E:\\study\\极客时间\\" + title + "\\" + i + "-" + post.title() + ".md" ); writer.write("" ); writer.flush(); } } } }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 public class AliyunOSSUtils { private static final String ENDPOINT = "oss-cn-beijing.aliyuncs.com" ; private static final String ACCESSKEY_ID = "LTAI4FofEpjtgLaKRaRAZqe4" ; private static final String ACCESSKEY_SECRET = "a6u8B0OANXReNcGfyDdLFK97a7xiaK" ; private static final String BUCKET_NAME = "haopeng-jksj" ; public static String uploadFile (String uniqueFileName, String path) throws IOException { java.net.URL url = new java .net.URL(path); URLConnection conn = url.openConnection(); InputStream in = conn.getInputStream(); uploadFile(uniqueFileName, in); return "https://" + BUCKET_NAME + "." + ENDPOINT + "/" + uniqueFileName; } public static void uploadFile (String uniqueFileName, InputStream in) { OSS ossClient = new OSSClientBuilder ().build("https://" + ENDPOINT, ACCESSKEY_ID, ACCESSKEY_SECRET); try { ossClient.putObject(BUCKET_NAME, uniqueFileName, in); } catch (OSSException oe) { System.out.println("Caught an OSSException, which means your request made it to OSS, " + "but was rejected with an error response for some reason." ); System.out.println("Error Message:" + oe.getErrorMessage()); System.out.println("Error Code:" + oe.getErrorCode()); System.out.println("Request ID:" + oe.getRequestId()); System.out.println("Host ID:" + oe.getHostId()); } catch (ClientException ce) { System.out.println("Caught an ClientException, which means the client encountered " + "a serious internal problem while trying to communicate with OSS, " + "such as not being able to access the network." ); System.out.println("Error Message:" + ce.getMessage()); } finally { if (ossClient != null ) { ossClient.shutdown(); } } } }
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
    <property name="pattern" value="%d{yyyy-MM-dd HH:mm:ss.SSS} [%-5level] %c %M %L [%thread] %m%n"/>

    <appender name="console" class="ch.qos.logback.core.ConsoleAppender">
        <target>System.out</target>
        <encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
            <pattern>${pattern}</pattern>
        </encoder>
    </appender>

    <root level="ALL">
        <appender-ref ref="console"></appender-ref>
    </root>
</configuration>