Jsoup爬虫 
      
jsoup 是一个用于处理真实世界 HTML 的 Java 库。它使用最好的 HTML5 DOM 方法和 CSS 选择器，提供了一个非常方便的 API，用于获取 URL 以及提取和操作数据。
 
 
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.14.3</version>
</dependency>

<dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-api</artifactId>
    <version>1.7.30</version>
</dependency>

<dependency>
    <groupId>ch.qos.logback</groupId>
    <artifactId>logback-classic</artifactId>
    <version>1.2.3</version>
</dependency>

<dependency>
    <groupId>cn.hutool</groupId>
    <artifactId>hutool-all</artifactId>
    <version>5.8.0.M4</version>
</dependency>

<dependency>
    <groupId>com.aliyun.oss</groupId>
    <artifactId>aliyun-sdk-oss</artifactId>
    <version>3.10.2</version>
</dependency>
 
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 public  class  JsoupDemo  {    public  static  final  Logger  log  =  LoggerFactory.getLogger(JsoupDemo.class);     public  static  void  main (String[] args)  throws  IOException {         Document  doc  =  Jsoup.connect("http://10.1.20.55/gt/" ).get();         log.info(doc.title());         Elements  newsHeadlines  =  doc.select("#column-list ul li" );         for  (Element headline : newsHeadlines) {             String  href  =  headline.children().attr("href" );             Document  archivePage  =  Jsoup.connect("http://10.1.20.55"  + href).get();             log.info("|||---------------------------------------------------|||" );             String  title  =  archivePage.title();             log.info("《《《 课程专题: {} 》》》" , title);             File  f2  =  new  File ("E:\\study\\极客时间\\"  + title);             if  (!f2.exists()) {                 f2.mkdir();             }             Elements  postList  =  archivePage.select("#column-content-list ul li" );             for  (int  i  =  0 ; i < postList.size(); i++) {                 String  itemHref  =  postList.get(i).children().attr("href" );                 if  (StrUtil.isBlank(itemHref)) {                     continue ;                 }                 Document  post  =  Jsoup.connect("http://10.1.20.55"  + itemHref).get();                 log.info("=== 文章标题: {} ===" , post.title());                 Elements  image  =  post.select("#divimage #img-content" );                 String  src  =  image.get(0 ).attr("src" );                 log.info(src);                 String  url  =  AliyunOSSUtils.uploadFile(IdUtil.getSnowflakeNextId() + "_"  + StrUtil.subAfter(src, "/" , true ), "http://10.1.20.55"  + src);                 System.out.println(url);                 FileWriter  writer  =  new  FileWriter ("E:\\study\\极客时间\\"  + title + "\\"  + i + "-"  + post.title() + ".md" 
);                 writer.write("" );                 writer.flush();             }         }     } } 
 
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 public  class  AliyunOSSUtils  {    private  static  final  String  ENDPOINT  =  "oss-cn-beijing.aliyuncs.com" ;     private  static  final  String  ACCESSKEY_ID  =  "LTAI4FofEpjtgLaKRaRAZqe4" ;     private  static  final  String  ACCESSKEY_SECRET  =  "a6u8B0OANXReNcGfyDdLFK97a7xiaK" ;     private  static  final  String  BUCKET_NAME  =  "haopeng-jksj" ;     public  static  String uploadFile (String uniqueFileName, String path)  throws  IOException {         java.net.URL  url  =  new  java .net.URL(path);         URLConnection  conn  =  url.openConnection();         InputStream  in  =  conn.getInputStream();         uploadFile(uniqueFileName, in);         return  "https://"  + BUCKET_NAME + "."  + ENDPOINT + "/"  + uniqueFileName;     }     public  static  void  uploadFile (String uniqueFileName, InputStream in)  {         OSS  ossClient  =  new  OSSClientBuilder ().build("https://"  + ENDPOINT, ACCESSKEY_ID, ACCESSKEY_SECRET);         try  {                                                                            ossClient.putObject(BUCKET_NAME, uniqueFileName, in);         } catch  (OSSException oe) {             System.out.println("Caught an OSSException, which means your request made it to OSS, "                      + "but was rejected with an error response for some reason." 
);             System.out.println("Error Message:"  + oe.getErrorMessage());             System.out.println("Error Code:"  + oe.getErrorCode());             System.out.println("Request ID:"  + oe.getRequestId());             System.out.println("Host ID:"  + oe.getHostId());         } catch  (ClientException ce) {             System.out.println("Caught an ClientException, which means the client encountered "                      + "a serious internal problem while trying to communicate with OSS, "                      + "such as not being able to access the network." );             System.out.println("Error Message:"  + ce.getMessage());         } finally  {             if  (ossClient != null ) {                 ossClient.shutdown();             }         }     } } 
 
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
    <property name="pattern" value="%d{yyyy-MM-dd HH:mm:ss.SSS} [%-5level] %c %M %L [%thread] %m%n"/>

    <appender name="console" class="ch.qos.logback.core.ConsoleAppender">
        <target>System.out</target>
        <encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
            <pattern>${pattern}</pattern>
        </encoder>
    </appender>

    <root level="ALL">
        <appender-ref ref="console"></appender-ref>
    </root>
</configuration>