|
|
1
|
+package com.mass.impl;
|
|
|
2
|
+
|
|
|
3
|
import com.mass.entity.CrawlerConfig;
import com.mass.service.PoliceNewsCrawlerService;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
|
|
|
21
|
+
|
|
|
22
|
+@Service
|
|
|
23
|
+public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService {
|
|
|
24
|
+
|
|
|
25
|
+ @Autowired
|
|
|
26
|
+ private CrawlerConfig crawlerConfig;
|
|
|
27
|
+
|
|
|
28
|
+ // 基础URL
|
|
|
29
|
+ //private static final String BASE_URL = "http://www.ah/web/Channel.aspx?chn=547&Page=";
|
|
|
30
|
+ //private static final int TOTAL_PAGES = 31;
|
|
|
31
|
+
|
|
|
32
|
+ // 输出目录
|
|
|
33
|
+ //private static final String OUTPUT_DIR = "crawled_news/";
|
|
|
34
|
+
|
|
|
35
|
+ // 用户代理和超时设置
|
|
|
36
|
+ private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36";
|
|
|
37
|
+ private static final int TIMEOUT = 10000;
|
|
|
38
|
+
|
|
|
39
|
+ // 存储已爬取的文件列表
|
|
|
40
|
+ private List<String> crawledFiles = new ArrayList<>();
|
|
|
41
|
+
|
|
|
42
|
+ @Override
|
|
|
43
|
+ public String crawlNewsByPage(int pageNum) {
|
|
|
44
|
+ int TOTAL_PAGES = Integer.parseInt(crawlerConfig.getPage());
|
|
|
45
|
+ if (pageNum < 1 || pageNum > TOTAL_PAGES) {
|
|
|
46
|
+ return "页码无效,有效范围: 1-" + TOTAL_PAGES;
|
|
|
47
|
+ }
|
|
|
48
|
+
|
|
|
49
|
+ try {
|
|
|
50
|
+ // 创建输出目录
|
|
|
51
|
+ Path outputPath = Paths.get(crawlerConfig.getDir());
|
|
|
52
|
+ if (!Files.exists(outputPath)) {
|
|
|
53
|
+ Files.createDirectories(outputPath);
|
|
|
54
|
+ }
|
|
|
55
|
+
|
|
|
56
|
+ String pageUrl = crawlerConfig.getUrl() + pageNum;
|
|
|
57
|
+ System.out.println("开始爬取页面: " + pageUrl);
|
|
|
58
|
+
|
|
|
59
|
+ // 获取列表页内容
|
|
|
60
|
+ Document listDoc = Jsoup.connect(pageUrl)
|
|
|
61
|
+ .userAgent(USER_AGENT)
|
|
|
62
|
+ .timeout(TIMEOUT)
|
|
|
63
|
+ .get();
|
|
|
64
|
+
|
|
|
65
|
+ // 根据图片信息,直接定位到时政要闻列表区域
|
|
|
66
|
+ Element downlistContainer = listDoc.select("div.downlist").first();
|
|
|
67
|
+ if (downlistContainer == null) {
|
|
|
68
|
+ return "未找到时政要闻列表容器 (div.downlist)";
|
|
|
69
|
+ }
|
|
|
70
|
+
|
|
|
71
|
+ // 移除分页区域,避免爬取分页链接
|
|
|
72
|
+ Elements pageListElements = downlistContainer.select("div.pageList");
|
|
|
73
|
+ pageListElements.remove();
|
|
|
74
|
+
|
|
|
75
|
+ // 提取列表中的新闻链接
|
|
|
76
|
+ Elements newsElements = downlistContainer.select("a[href*=.aspx]");
|
|
|
77
|
+
|
|
|
78
|
+ // 如果没有找到,尝试其他可能的选择器
|
|
|
79
|
+ if (newsElements.isEmpty()) {
|
|
|
80
|
+ newsElements = downlistContainer.select("li a");
|
|
|
81
|
+ }
|
|
|
82
|
+
|
|
|
83
|
+ System.out.println("找到 " + newsElements.size() + " 个新闻链接");
|
|
|
84
|
+
|
|
|
85
|
+ AtomicInteger successCount = new AtomicInteger(0);
|
|
|
86
|
+
|
|
|
87
|
+ // 处理每个新闻链接
|
|
|
88
|
+ for (Element element : newsElements) {
|
|
|
89
|
+ String href = element.attr("href");
|
|
|
90
|
+ String title = element.text();
|
|
|
91
|
+
|
|
|
92
|
+ System.out.println("界面href: " + href);
|
|
|
93
|
+ System.out.println("界面title: " + title);
|
|
|
94
|
+
|
|
|
95
|
+ if (href != null && !href.isEmpty() && title != null && !title.isEmpty()) {
|
|
|
96
|
+ try {
|
|
|
97
|
+ // 确保URL是绝对路径,并替换域名为IP
|
|
|
98
|
+ if (!href.startsWith("http")) {
|
|
|
99
|
+ // 使用IP地址替换域名
|
|
|
100
|
+ href = "http://53.1.230.100" + (href.startsWith("/") ? "" : "/") + href;
|
|
|
101
|
+ } else {
|
|
|
102
|
+ // 如果已经是完整URL,替换其中的域名部分
|
|
|
103
|
+ href = href.replace("www.ah", "53.1.230.100");
|
|
|
104
|
+ }
|
|
|
105
|
+
|
|
|
106
|
+ System.out.println("生成href: " + href);
|
|
|
107
|
+
|
|
|
108
|
+ // 获取详情页内容
|
|
|
109
|
+ Document detailDoc = Jsoup.connect(href)
|
|
|
110
|
+ .userAgent(USER_AGENT)
|
|
|
111
|
+ .timeout(TIMEOUT)
|
|
|
112
|
+ .get();
|
|
|
113
|
+
|
|
|
114
|
+ // 去除标题中的访问量数字和括号
|
|
|
115
|
+ String cleanTitle = removeAccessCountFromTitle(title);
|
|
|
116
|
+
|
|
|
117
|
+ // 使用清理后的标题作为文件名,过滤非法字符
|
|
|
118
|
+ String fileName = cleanTitle.replaceAll("[\\\\/:*?\"<>|]", "_") + ".html";
|
|
|
119
|
+ System.out.println("生成fileName: " + fileName);
|
|
|
120
|
+
|
|
|
121
|
+ // 保存HTML内容
|
|
|
122
|
+ Path filePath = Paths.get(crawlerConfig.getDir(), fileName);
|
|
|
123
|
+ Files.write(filePath, detailDoc.html().getBytes());
|
|
|
124
|
+
|
|
|
125
|
+ // 记录已爬取的文件
|
|
|
126
|
+ crawledFiles.add(fileName);
|
|
|
127
|
+ successCount.incrementAndGet();
|
|
|
128
|
+
|
|
|
129
|
+ System.out.println("已保存: " + fileName);
|
|
|
130
|
+
|
|
|
131
|
+ // 添加延迟避免被封
|
|
|
132
|
+ Thread.sleep(1000);
|
|
|
133
|
+ } catch (Exception e) {
|
|
|
134
|
+ System.err.println("处理新闻失败: " + title + " - " + e.getMessage());
|
|
|
135
|
+ }
|
|
|
136
|
+ }
|
|
|
137
|
+ }
|
|
|
138
|
+
|
|
|
139
|
+ return String.format("第%d页爬取完成,成功保存%d个新闻文件", pageNum, successCount.get());
|
|
|
140
|
+
|
|
|
141
|
+ } catch (IOException e) {
|
|
|
142
|
+ return "爬取页面失败: " + e.getMessage();
|
|
|
143
|
+ }
|
|
|
144
|
+ }
|
|
|
145
|
+
|
|
|
146
|
+ /**
|
|
|
147
|
+ * 从标题中去除访问量数字和括号
|
|
|
148
|
+ * 例如: "习近平回信勉励全国特岗教师代表...(121)" -> "习近平回信勉励全国特岗教师代表..."
|
|
|
149
|
+ */
|
|
|
150
|
+ private String removeAccessCountFromTitle(String title) {
|
|
|
151
|
+ // 使用正则表达式匹配末尾的括号和数字
|
|
|
152
|
+ return title.replaceAll("\\s*\\(\\d+\\)$", "");
|
|
|
153
|
+ }
|
|
|
154
|
+
|
|
|
155
|
+ @Override
|
|
|
156
|
+ public String crawlAllNews() {
|
|
|
157
|
+ StringBuilder result = new StringBuilder();
|
|
|
158
|
+ int totalSuccess = 0;
|
|
|
159
|
+
|
|
|
160
|
+ int TOTAL_PAGES = Integer.parseInt(crawlerConfig.getPage());
|
|
|
161
|
+
|
|
|
162
|
+ for (int page = 1; page <= TOTAL_PAGES; page++) {
|
|
|
163
|
+ String pageResult = crawlNewsByPage(page);
|
|
|
164
|
+ result.append(pageResult).append("\n");
|
|
|
165
|
+
|
|
|
166
|
+ // 提取成功数量
|
|
|
167
|
+ Pattern pattern = Pattern.compile("成功保存(\\d+)个新闻文件");
|
|
|
168
|
+ Matcher matcher = pattern.matcher(pageResult);
|
|
|
169
|
+ if (matcher.find()) {
|
|
|
170
|
+ totalSuccess += Integer.parseInt(matcher.group(1));
|
|
|
171
|
+ }
|
|
|
172
|
+
|
|
|
173
|
+ // 添加页面之间的延迟
|
|
|
174
|
+ try {
|
|
|
175
|
+ Thread.sleep(2000);
|
|
|
176
|
+ } catch (InterruptedException e) {
|
|
|
177
|
+ Thread.currentThread().interrupt();
|
|
|
178
|
+ }
|
|
|
179
|
+ }
|
|
|
180
|
+
|
|
|
181
|
+ result.append(String.format("全部爬取完成,共保存%d个新闻文件", totalSuccess));
|
|
|
182
|
+ return result.toString();
|
|
|
183
|
+ }
|
|
|
184
|
+} |