PoliceNewsCrawlerServiceImpl.java
package com.mass.impl;

import com.mass.entity.CrawlerConfig;
import com.mass.service.PoliceNewsCrawlerService;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawls current-affairs news ("时政要闻") list pages with Jsoup, follows each
 * article link, and saves every article's HTML into a local directory.
 * Base URL, total page count, and output directory come from {@link CrawlerConfig}.
 */
@Service
public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService {

    @Autowired
    private CrawlerConfig crawlerConfig;
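
    // CrawlerConfig is assumed, from its usage below, to expose three string
    // properties: getUrl() (list-page base URL with the page number appended),
    // getPage() (total page count), and getDir() (output directory).
    // A minimal sketch of such a bean, assuming standard Spring property binding
    // (the "crawler" prefix and field names are hypothetical):
    //
    //   @Component
    //   @ConfigurationProperties(prefix = "crawler")
    //   public class CrawlerConfig {
    //       private String url;  // e.g. "http://www.ah/web/Channel.aspx?chn=547&Page="
    //       private String page; // e.g. "31"
    //       private String dir;  // e.g. "crawled_news/"
    //       // getters and setters omitted
    //   }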

    // Base URL (now supplied by CrawlerConfig; kept for reference)
    //private static final String BASE_URL = "http://www.ah/web/Channel.aspx?chn=547&Page=";
    //private static final int TOTAL_PAGES = 31;

    // Output directory (now supplied by CrawlerConfig; kept for reference)
    //private static final String OUTPUT_DIR = "crawled_news/";

    // User agent and request timeout settings
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36";
    private static final int TIMEOUT = 10000; // milliseconds

    // Names of files crawled so far; synchronized because this singleton
    // @Service may be invoked from concurrent requests
    private final List<String> crawledFiles = Collections.synchronizedList(new ArrayList<>());

    /**
     * Crawls a single list page and saves each linked article as an HTML file.
     *
     * @param pageNum 1-based page number within the configured range
     * @return a human-readable result message
     */
    @Override
    public String crawlNewsByPage(int pageNum) {
        int totalPages = Integer.parseInt(crawlerConfig.getPage());
        if (pageNum < 1 || pageNum > totalPages) {
            return "Invalid page number; valid range: 1-" + totalPages;
        }

        try {
            // Create the output directory if it does not exist
            Path outputPath = Paths.get(crawlerConfig.getDir());
            if (!Files.exists(outputPath)) {
                Files.createDirectories(outputPath);
            }

            String pageUrl = crawlerConfig.getUrl() + pageNum;
            System.out.println("Crawling list page: " + pageUrl);

            // Fetch the list page
            Document listDoc = Jsoup.connect(pageUrl)
                    .userAgent(USER_AGENT)
                    .timeout(TIMEOUT)
                    .get();

            // Based on the page screenshot, locate the news list area directly
            Element downlistContainer = listDoc.select("div.downlist").first();
            if (downlistContainer == null) {
                return "News list container not found (div.downlist)";
            }

            // Remove the pagination area so its links are not crawled as news
            Elements pageListElements = downlistContainer.select("div.pageList");
            pageListElements.remove();
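            // (Elements.remove() detaches every matched element from the parsed
            // document tree, so the selectors below no longer see them)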

            // Extract the news links from the list
            Elements newsElements = downlistContainer.select("a[href*=.aspx]");

            // Fallback: if nothing matched, try a more general selector
            if (newsElements.isEmpty()) {
                newsElements = downlistContainer.select("li a");
            }

            System.out.println("Found " + newsElements.size() + " news links");

            AtomicInteger successCount = new AtomicInteger(0);

            // Process each news link
            for (Element element : newsElements) {
                String href = element.attr("href");
                String title = element.text();

                System.out.println("List-page href: " + href);
                System.out.println("List-page title: " + title);

                // Jsoup's attr() and text() return "" rather than null, so emptiness checks suffice
                if (!href.isEmpty() && !title.isEmpty()) {
                    try {
                        // Ensure the URL is absolute, substituting the IP for the hostname
                        if (!href.startsWith("http")) {
                            // Relative link: prepend the server IP
                            href = "http://53.1.230.100" + (href.startsWith("/") ? "" : "/") + href;
                        } else {
                            // Full URL: replace the hostname portion with the IP
                            href = href.replace("www.ah", "53.1.230.100");
                        }

                        System.out.println("Resolved href: " + href);

                        // Fetch the article detail page
                        Document detailDoc = Jsoup.connect(href)
                                .userAgent(USER_AGENT)
                                .timeout(TIMEOUT)
                                .get();

                        // Strip the trailing view count and parentheses from the title
                        String cleanTitle = removeAccessCountFromTitle(title);

                        // Use the cleaned title as the file name, replacing characters
                        // that are illegal in Windows file names
                        String fileName = cleanTitle.replaceAll("[\\\\/:*?\"<>|]", "_") + ".html";
                        System.out.println("Output fileName: " + fileName);

                        // Save the HTML content with an explicit charset rather than the platform default
                        Path filePath = Paths.get(crawlerConfig.getDir(), fileName);
                        Files.write(filePath, detailDoc.html().getBytes(StandardCharsets.UTF_8));
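                        // (With no OpenOptions, Files.write creates the file or truncates an
                        // existing one, so re-crawling a page overwrites earlier copies)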

                        // Record the crawled file
                        crawledFiles.add(fileName);
                        successCount.incrementAndGet();

                        System.out.println("Saved: " + fileName);

                        // Delay between requests to avoid being blocked
                        Thread.sleep(1000);
                    } catch (InterruptedException ie) {
                        // Restore the interrupt flag and stop processing this page
                        Thread.currentThread().interrupt();
                        break;
                    } catch (Exception e) {
                        System.err.println("Failed to process news item: " + title + " - " + e.getMessage());
                    }
                }
            }

            return String.format("Page %d done; successfully saved %d news files", pageNum, successCount.get());

        } catch (IOException e) {
            return "Failed to crawl page: " + e.getMessage();
        }
    }

    /**
     * Removes the trailing view-count number and parentheses from a title.
     * Example: "习近平回信勉励全国特岗教师代表...(121)" -> "习近平回信勉励全国特岗教师代表..."
     */
    private String removeAccessCountFromTitle(String title) {
        // Regex matches optional whitespace followed by a trailing "(digits)" group
        return title.replaceAll("\\s*\\(\\d+\\)$", "");
    }

    /**
     * Crawls every configured page in sequence and aggregates the per-page results.
     */
    @Override
    public String crawlAllNews() {
        StringBuilder result = new StringBuilder();
        int totalSuccess = 0;

        int totalPages = Integer.parseInt(crawlerConfig.getPage());

        // Compiled once; must match the success message produced by crawlNewsByPage
        Pattern pattern = Pattern.compile("successfully saved (\\d+) news files");

        for (int page = 1; page <= totalPages; page++) {
            String pageResult = crawlNewsByPage(page);
            result.append(pageResult).append("\n");

            // Extract the per-page success count from the result message
            Matcher matcher = pattern.matcher(pageResult);
            if (matcher.find()) {
                totalSuccess += Integer.parseInt(matcher.group(1));
            }

            // Delay between pages to avoid hammering the server
            try {
                Thread.sleep(2000);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }

        result.append(String.format("All pages crawled; %d news files saved in total", totalSuccess));
        return result.toString();
    }
}
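
// For reference, the PoliceNewsCrawlerService interface implemented above is
// assumed (from the two @Override methods) to declare:
//
//   public interface PoliceNewsCrawlerService {
//       String crawlNewsByPage(int pageNum);
//       String crawlAllNews();
//   }
//
// A hypothetical caller, such as a controller endpoint, would inject the service
// and invoke policeNewsCrawlerService.crawlNewsByPage(1) or crawlAllNews().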