Commit 494c85fb770bdeeb0b7ed9f22f48a3f8f68106a4

Authored by 盛长浩
1 parent 4523ff0e

Add public security network crawler code

Dockerfile

 FROM openjdk:8-jdk-alpine
 
+# Declare overridable environment variables
+ENV BASE_URL="http://www.ah/web/Channel.aspx?chn=547&Page=" \
+    TOTAL_PAGES="31" \
+    OUTPUT_DIR="/yuanjing/cuai/mSync/files/"
+
 #WORKDIR /app
 
 COPY target/massSync-1.0-SNAPSHOT.jar /mass-sync.jar
pom.xml

@@ -70,6 +70,12 @@
             <artifactId>fastjson</artifactId>
             <version>1.2.37</version>
         </dependency>
+
+        <dependency>
+            <groupId>org.jsoup</groupId>
+            <artifactId>jsoup</artifactId>
+            <version>1.15.3</version>
+        </dependency>
     </dependencies>
 
     <build>
CrawlerController.java (new file)

+package com.mass.controller;
+
+import com.mass.service.PoliceNewsCrawlerService;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.web.bind.annotation.GetMapping;
+import org.springframework.web.bind.annotation.PathVariable;
+import org.springframework.web.bind.annotation.RequestMapping;
+import org.springframework.web.bind.annotation.RestController;
+
+@RestController
+@RequestMapping("/api/crawler")
+public class CrawlerController {
+
+    @Autowired
+    private PoliceNewsCrawlerService crawlerService;
+
+    @GetMapping("/page/{pageNum}")
+    public String crawlPage(@PathVariable int pageNum) {
+        return crawlerService.crawlNewsByPage(pageNum);
+    }
+
+    @GetMapping("/all")
+    public String crawlAll() {
+        return crawlerService.crawlAllNews();
+    }
+}
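
For reference, the two new endpoints can be exercised with any HTTP client once the service is running. The sketch below is illustrative only: it assumes the application listens on localhost:8080 (the port is not part of this commit), and the class name CrawlerClientExample is hypothetical.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

public class CrawlerClientExample {
    public static void main(String[] args) throws Exception {
        // Trigger a crawl of list page 1 via the new controller
        URL url = new URL("http://localhost:8080/api/crawler/page/1");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("GET");
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
            String line;
            while ((line = in.readLine()) != null) {
                System.out.println(line);  // prints the crawl result message
            }
        }
        // /api/crawler/all works the same way and crawls every configured page
    }
}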
CrawlerConfig.java (new file)

+package com.mass.entity;
+
+import lombok.Data;
+import org.springframework.boot.context.properties.ConfigurationProperties;
+import org.springframework.stereotype.Component;
+
+@ConfigurationProperties(prefix = "crawler.base")
+@Component
+@Data // Lombok generates getters/setters
+public class CrawlerConfig {
+    private String url;   // crawler.base.url: list-page base URL
+    private String page;  // crawler.base.page: total number of list pages
+    private String dir;   // crawler.base.dir: output directory for saved pages
+}
PoliceNewsCrawlerServiceImpl.java (new file)

+package com.mass.impl;
+
+import com.mass.entity.CrawlerConfig;
+import com.mass.service.PoliceNewsCrawlerService;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Service;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+@Service
+public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService {
+
+    @Autowired
+    private CrawlerConfig crawlerConfig;
+
+    // Base URL and total page count (now read from CrawlerConfig)
+    //private static final String BASE_URL = "http://www.ah/web/Channel.aspx?chn=547&Page=";
+    //private static final int TOTAL_PAGES = 31;
+
+    // Output directory (now read from CrawlerConfig)
+    //private static final String OUTPUT_DIR = "crawled_news/";
+
+    // User agent and timeout settings
+    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36";
+    private static final int TIMEOUT = 10000;
+
+    // List of files crawled so far
+    private List<String> crawledFiles = new ArrayList<>();
+
+    @Override
+    public String crawlNewsByPage(int pageNum) {
+        int totalPages = Integer.parseInt(crawlerConfig.getPage());
+        if (pageNum < 1 || pageNum > totalPages) {
+            return "页码无效,有效范围: 1-" + totalPages;
+        }
+
+        try {
+            // Create the output directory if it does not exist
+            Path outputPath = Paths.get(crawlerConfig.getDir());
+            if (!Files.exists(outputPath)) {
+                Files.createDirectories(outputPath);
+            }
+
+            String pageUrl = crawlerConfig.getUrl() + pageNum;
+            System.out.println("开始爬取页面: " + pageUrl);
+
+            // Fetch the list page
+            Document listDoc = Jsoup.connect(pageUrl)
+                    .userAgent(USER_AGENT)
+                    .timeout(TIMEOUT)
+                    .get();
+
+            // Based on the page screenshot, locate the current-affairs news list area directly
+            Element downlistContainer = listDoc.select("div.downlist").first();
+            if (downlistContainer == null) {
+                return "未找到时政要闻列表容器 (div.downlist)";
+            }
+
+            // Remove the pagination area so its links are not crawled
+            Elements pageListElements = downlistContainer.select("div.pageList");
+            pageListElements.remove();
+
+            // Extract the news links from the list
+            Elements newsElements = downlistContainer.select("a[href*=.aspx]");
+
+            // If nothing was found, fall back to a more general selector
+            if (newsElements.isEmpty()) {
+                newsElements = downlistContainer.select("li a");
+            }
+
+            System.out.println("找到 " + newsElements.size() + " 个新闻链接");
+
+            AtomicInteger successCount = new AtomicInteger(0);
+
+            // Process each news link
+            for (Element element : newsElements) {
+                String href = element.attr("href");
+                String title = element.text();
+
+                System.out.println("界面href: " + href);
+                System.out.println("界面title: " + title);
+
+                if (href != null && !href.isEmpty() && title != null && !title.isEmpty()) {
+                    try {
+                        // Make the URL absolute and replace the domain with the IP address
+                        if (!href.startsWith("http")) {
+                            href = "http://53.1.230.100" + (href.startsWith("/") ? "" : "/") + href;
+                        } else {
+                            // Already a full URL: replace only the domain part
+                            href = href.replace("www.ah", "53.1.230.100");
+                        }
+
+                        System.out.println("生成href: " + href);
+
+                        // Fetch the detail page
+                        Document detailDoc = Jsoup.connect(href)
+                                .userAgent(USER_AGENT)
+                                .timeout(TIMEOUT)
+                                .get();
+
+                        // Strip the view-count number and parentheses from the title
+                        String cleanTitle = removeAccessCountFromTitle(title);
+
+                        // Use the cleaned title as the file name, replacing illegal characters
+                        String fileName = cleanTitle.replaceAll("[\\\\/:*?\"<>|]", "_") + ".html";
+                        System.out.println("生成fileName: " + fileName);
+
+                        // Save the HTML content (written explicitly as UTF-8)
+                        Path filePath = Paths.get(crawlerConfig.getDir(), fileName);
+                        Files.write(filePath, detailDoc.html().getBytes(StandardCharsets.UTF_8));
+
+                        // Record the crawled file
+                        crawledFiles.add(fileName);
+                        successCount.incrementAndGet();
+
+                        System.out.println("已保存: " + fileName);
+
+                        // Small delay to avoid being blocked
+                        Thread.sleep(1000);
+                    } catch (Exception e) {
+                        System.err.println("处理新闻失败: " + title + " - " + e.getMessage());
+                    }
+                }
+            }
+
+            return String.format("第%d页爬取完成,成功保存%d个新闻文件", pageNum, successCount.get());
+
+        } catch (IOException e) {
+            return "爬取页面失败: " + e.getMessage();
+        }
+    }
+
+    /**
+     * Strip the trailing view-count number and parentheses from a title,
+     * e.g. "习近平回信勉励全国特岗教师代表...(121)" -> "习近平回信勉励全国特岗教师代表..."
+     */
+    private String removeAccessCountFromTitle(String title) {
+        // Match a trailing "(digits)" (with optional leading whitespace) and remove it
+        return title.replaceAll("\\s*\\(\\d+\\)$", "");
+    }
+
+    @Override
+    public String crawlAllNews() {
+        StringBuilder result = new StringBuilder();
+        int totalSuccess = 0;
+
+        int totalPages = Integer.parseInt(crawlerConfig.getPage());
+
+        for (int page = 1; page <= totalPages; page++) {
+            String pageResult = crawlNewsByPage(page);
+            result.append(pageResult).append("\n");
+
+            // Extract the per-page success count from the result message
+            Pattern pattern = Pattern.compile("成功保存(\\d+)个新闻文件");
+            Matcher matcher = pattern.matcher(pageResult);
+            if (matcher.find()) {
+                totalSuccess += Integer.parseInt(matcher.group(1));
+            }
+
+            // Delay between pages
+            try {
+                Thread.sleep(2000);
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+            }
+        }
+
+        result.append(String.format("全部爬取完成,共保存%d个新闻文件", totalSuccess));
+        return result.toString();
+    }
+}
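
The title cleanup above hinges on a single regular expression that removes a trailing view count such as "(121)". A minimal standalone check of that behavior (not part of the commit; the title string and class name are made up) could look like this:

public class TitleCleanupExample {
    public static void main(String[] args) {
        String title = "某某新闻标题 (123)";
        // Same pattern as removeAccessCountFromTitle: strip a trailing "(digits)"
        String clean = title.replaceAll("\\s*\\(\\d+\\)$", "");
        System.out.println(clean);  // prints "某某新闻标题"
    }
}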
PoliceNewsCrawlerService.java (new file)

+package com.mass.service;
+
+/**
+ * Crawler service for current-affairs news on the public security website
+ */
+public interface PoliceNewsCrawlerService {
+
+    /**
+     * Crawl the current-affairs news list of a single page
+     * @param pageNum page number
+     * @return result message of the crawl
+     */
+    String crawlNewsByPage(int pageNum);
+
+    /**
+     * Crawl the current-affairs news of all pages
+     * @return result message of the crawl
+     */
+    String crawlAllNews();
+}
UserSyncService.java

@@ -45,7 +45,7 @@ public class UserSyncService {
     private static final String USER_DATE_FILE = "userDate.txt";
     private static final String ORG_DATE_FILE = "orgDate.txt";
 
-    @Scheduled(cron = "*/5 * * * * ?")
+    //@Scheduled(cron = "*/5 * * * * ?")
     public void snyc() {
         log.info( Thread.currentThread().getName()+"====test");
     }
application.yml

@@ -23,3 +23,9 @@ mass:
   orgListUpdateUrl: /UUDB/org/updateList
   tmpFilePath: /tmp/syncFile/
 
+crawler:
+  base:
+    url: ${BASE_URL:http://www.ah/web/Channel.aspx?chn=547&Page=}
+    page: ${TOTAL_PAGES:31}
+    dir: ${OUTPUT_DIR:/yuanjing/cuai/mSync/files/}
+
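
These properties bind to CrawlerConfig through @ConfigurationProperties(prefix = "crawler.base"), and the ${NAME:default} placeholders mean an environment variable (for example the BASE_URL declared in the Dockerfile) takes precedence over the built-in default. Roughly, and ignoring Spring's other property sources, the resolution behaves like the sketch below (illustration only, not code from the commit; the class name is hypothetical):

public class CrawlerConfigResolutionExample {
    public static void main(String[] args) {
        // An environment variable wins if set; otherwise the default after the colon is used
        String url = System.getenv().getOrDefault(
                "BASE_URL", "http://www.ah/web/Channel.aspx?chn=547&Page=");
        String page = System.getenv().getOrDefault("TOTAL_PAGES", "31");
        String dir = System.getenv().getOrDefault("OUTPUT_DIR", "/yuanjing/cuai/mSync/files/");
        System.out.println(url + " | " + page + " | " + dir);
    }
}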