|
|
1
|
+package com.mass.impl;
|
|
|
2
|
+
|
|
|
3
|
import com.mass.entity.CrawlerConfig;
import com.mass.service.PoliceNewsCrawlerService;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
|
|
|
21
|
+
|
|
|
22
|
+@Service
|
|
|
23
|
+public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService {
|
|
|
24
|
+
|
|
|
25
|
+ @Autowired
|
|
|
26
|
+ private CrawlerConfig crawlerConfig;
|
|
|
27
|
+
|
|
|
28
|
+ // 基础URL
|
|
|
29
|
+ //private static final String BASE_URL = "http://www.ah/web/Channel.aspx?chn=547&Page=";
|
|
|
30
|
+ //private static final int TOTAL_PAGES = 31;
|
|
|
31
|
+
|
|
|
32
|
+ // 输出目录
|
|
|
33
|
+ //private static final String OUTPUT_DIR = "crawled_news/";
|
|
|
34
|
+
|
|
|
35
|
+ // 用户代理和超时设置
|
|
|
36
|
+ private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36";
|
|
|
37
|
+ private static final int TIMEOUT = 10000;
|
|
|
38
|
+
|
|
|
39
|
+ // 存储已爬取的文件列表
|
|
|
40
|
+ private List<String> crawledFiles = new ArrayList<>();
|
|
|
41
|
+
|
|
|
42
|
+ @Override
|
|
|
43
|
+ public String crawlNewsByPage(int pageNum) {
|
|
|
44
|
+ int TOTAL_PAGES = Integer.parseInt(crawlerConfig.getPage());
|
|
|
45
|
+ if (pageNum < 1 || pageNum > TOTAL_PAGES) {
|
|
|
46
|
+ return "页码无效,有效范围: 1-" + TOTAL_PAGES;
|
|
|
47
|
+ }
|
|
|
48
|
+
|
|
|
49
|
+ try {
|
|
|
50
|
+ // 创建输出目录
|
|
|
51
|
+ Path outputPath = Paths.get(crawlerConfig.getDir());
|
|
|
52
|
+ if (!Files.exists(outputPath)) {
|
|
|
53
|
+ Files.createDirectories(outputPath);
|
|
|
54
|
+ }
|
|
|
55
|
+
|
|
|
56
|
+ String pageUrl = crawlerConfig.getUrl() + pageNum;
|
|
|
57
|
+ System.out.println("开始爬取页面: " + pageUrl);
|
|
|
58
|
+
|
|
|
59
|
+ // 获取列表页内容
|
|
|
60
|
+ Document listDoc = Jsoup.connect(pageUrl)
|
|
|
61
|
+ .userAgent(USER_AGENT)
|
|
|
62
|
+ .timeout(TIMEOUT)
|
|
|
63
|
+ .get();
|
|
|
64
|
+
|
|
|
65
|
+ // 根据图片信息,直接定位到时政要闻列表区域
|
|
|
66
|
+ Element downlistContainer = listDoc.select("div.downlist").first();
|
|
|
67
|
+ if (downlistContainer == null) {
|
|
|
68
|
+ return "未找到时政要闻列表容器 (div.downlist)";
|
|
|
69
|
+ }
|
|
|
70
|
+
|
|
|
71
|
+ // 移除分页区域,避免爬取分页链接
|
|
|
72
|
+ Elements pageListElements = downlistContainer.select("div.pageList");
|
|
|
73
|
+ pageListElements.remove();
|
|
|
74
|
+
|
|
|
75
|
+ // 提取列表中的新闻链接
|
|
|
76
|
+ Elements newsElements = downlistContainer.select("a[href*=.aspx]");
|
|
|
77
|
+
|
|
|
78
|
+ // 如果没有找到,尝试其他可能的选择器
|
|
|
79
|
+ if (newsElements.isEmpty()) {
|
|
|
80
|
+ newsElements = downlistContainer.select("li a");
|
|
|
81
|
+ }
|
|
|
82
|
+
|
|
|
83
|
+ System.out.println("找到 " + newsElements.size() + " 个新闻链接");
|
|
|
84
|
+
|
|
|
85
|
+ AtomicInteger successCount = new AtomicInteger(0);
|
|
|
86
|
+
|
|
|
87
|
+ // 处理每个新闻链接
|
|
|
88
|
+ for (Element element : newsElements) {
|
|
|
89
|
+ String href = element.attr("href");
|
|
|
90
|
+ String title = element.text();
|
|
|
91
|
+
|
|
|
92
|
+ System.out.println("界面href: " + href);
|
|
|
93
|
+ System.out.println("界面title: " + title);
|
|
|
94
|
+
|
|
|
95
|
+ if (href != null && !href.isEmpty() && title != null && !title.isEmpty()) {
|
|
|
96
|
+ try {
|
|
|
97
|
+ // 确保URL是绝对路径,并替换域名为IP
|
|
|
98
|
+ if (!href.startsWith("http")) {
|
|
|
99
|
+ // 使用IP地址替换域名
|
|
|
100
|
+ href = "http://53.1.230.100" + (href.startsWith("/") ? "" : "/") + href;
|
|
|
101
|
+ } else {
|
|
|
102
|
+ // 如果已经是完整URL,替换其中的域名部分
|
|
|
103
|
+ href = href.replace("www.ah", "53.1.230.100");
|
|
|
104
|
+ }
|
|
|
105
|
+
|
|
|
106
|
+ System.out.println("生成href: " + href);
|
|
|
107
|
+
|
|
|
108
|
+ // 获取详情页内容
|
|
|
109
|
+ Document detailDoc = Jsoup.connect(href)
|
|
|
110
|
+ .userAgent(USER_AGENT)
|
|
|
111
|
+ .timeout(TIMEOUT)
|
|
|
112
|
+ .get();
|
|
|
113
|
+
|
|
|
114
|
+ // 去除标题中的访问量数字和括号
|
|
|
115
|
+ String cleanTitle = removeAccessCountFromTitle(title);
|
|
|
116
|
+
|
|
|
117
|
+ // 使用清理后的标题作为文件名,过滤非法字符
|
|
|
118
|
+ String fileName = cleanTitle.replaceAll("[\\\\/:*?\"<>|]", "_") + ".html";
|
|
|
119
|
+ System.out.println("生成fileName: " + fileName);
|
|
|
120
|
+
|
|
|
121
|
+ // 保存HTML内容
|
|
|
122
|
+ Path filePath = Paths.get(crawlerConfig.getDir(), fileName);
|
|
|
123
|
+ Files.write(filePath, detailDoc.html().getBytes());
|
|
|
124
|
+
|
|
|
125
|
+ // 记录已爬取的文件
|
|
|
126
|
+ crawledFiles.add(fileName);
|
|
|
127
|
+ successCount.incrementAndGet();
|
|
|
128
|
+
|
|
|
129
|
+ System.out.println("已保存: " + fileName);
|
|
|
130
|
+
|
|
|
131
|
+ // 添加延迟避免被封
|
|
|
132
|
+ Thread.sleep(1000);
|
|
|
133
|
+ } catch (Exception e) {
|
|
|
134
|
+ System.err.println("处理新闻失败: " + title + " - " + e.getMessage());
|
|
|
135
|
+ }
|
|
|
136
|
+ }
|
|
|
137
|
+ }
|
|
|
138
|
+
|
|
|
139
|
+ return String.format("第%d页爬取完成,成功保存%d个新闻文件", pageNum, successCount.get());
|
|
|
140
|
+
|
|
|
141
|
+ } catch (IOException e) {
|
|
|
142
|
+ return "爬取页面失败: " + e.getMessage();
|
|
|
143
|
+ }
|
|
|
144
|
+ }
|
|
|
145
|
+
|
|
|
146
|
+ /**
|
|
|
147
|
+ * 从标题中去除访问量数字和括号
|
|
|
148
|
+ * 例如: "习近平回信勉励全国特岗教师代表...(121)" -> "习近平回信勉励全国特岗教师代表..."
|
|
|
149
|
+ */
|
|
|
150
|
+ private String removeAccessCountFromTitle(String title) {
|
|
|
151
|
+ // 使用正则表达式匹配末尾的括号和数字
|
|
|
152
|
+ return title.replaceAll("\\s*\\(\\d+\\)$", "");
|
|
|
153
|
+ }
|
|
|
154
|
+
|
|
|
155
|
+ @Override
|
|
|
156
|
+ public String crawlAllNews() {
|
|
|
157
|
+ StringBuilder result = new StringBuilder();
|
|
|
158
|
+ int totalSuccess = 0;
|
|
|
159
|
+
|
|
|
160
|
+ int TOTAL_PAGES = Integer.parseInt(crawlerConfig.getPage());
|
|
|
161
|
+
|
|
|
162
|
+ for (int page = 1; page <= TOTAL_PAGES; page++) {
|
|
|
163
|
+ String pageResult = crawlNewsByPage(page);
|
|
|
164
|
+ result.append(pageResult).append("\n");
|
|
|
165
|
+
|
|
|
166
|
+ // 提取成功数量
|
|
|
167
|
+ Pattern pattern = Pattern.compile("成功保存(\\d+)个新闻文件");
|
|
|
168
|
+ Matcher matcher = pattern.matcher(pageResult);
|
|
|
169
|
+ if (matcher.find()) {
|
|
|
170
|
+ totalSuccess += Integer.parseInt(matcher.group(1));
|
|
|
171
|
+ }
|
|
|
172
|
+
|
|
|
173
|
+ // 添加页面之间的延迟
|
|
|
174
|
+ try {
|
|
|
175
|
+ Thread.sleep(2000);
|
|
|
176
|
+ } catch (InterruptedException e) {
|
|
|
177
|
+ Thread.currentThread().interrupt();
|
|
|
178
|
+ }
|
|
|
179
|
+ }
|
|
|
180
|
+
|
|
|
181
|
+ result.append(String.format("全部爬取完成,共保存%d个新闻文件", totalSuccess));
|
|
|
182
|
+ return result.toString();
|
|
|
183
|
+ }
|
|
|
184
|
+} |