PoliceNewsCrawlerServiceImpl.java
package com.mass.impl;

import com.mass.entity.CrawlerConfig;
import com.mass.service.PoliceNewsCrawlerService;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@Service
public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService {

    @Autowired
    private CrawlerConfig crawlerConfig;

    // Base URL
    // private static final String BASE_URL = "http://www.ah/web/Channel.aspx?chn=547&Page=";
    // private static final int TOTAL_PAGES = 31;
    // Output directory
    // private static final String OUTPUT_DIR = "crawled_news/";

    // User agent and timeout settings for HTTP requests
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36";
    private static final int TIMEOUT = 10000;

    // List of files that have already been crawled and saved
    private final List<String> crawledFiles = new ArrayList<>();

    @Override
    public String crawlNewsByPage(int pageNum) {
        int totalPages = Integer.parseInt(crawlerConfig.getPage());
        if (pageNum < 1 || pageNum > totalPages) {
            return "页码无效,有效范围: 1-" + totalPages;
        }
        try {
            // Create the output directory if it does not exist yet
            Path outputPath = Paths.get(crawlerConfig.getDir());
            if (!Files.exists(outputPath)) {
                Files.createDirectories(outputPath);
            }

            String pageUrl = crawlerConfig.getUrl() + pageNum;
            System.out.println("开始爬取页面: " + pageUrl);

            // Fetch the list page
            Document listDoc = Jsoup.connect(pageUrl)
                    .userAgent(USER_AGENT)
                    .timeout(TIMEOUT)
                    .get();

            // Locate the news list container directly, based on the page layout
            Element downlistContainer = listDoc.select("div.downlist").first();
            if (downlistContainer == null) {
                return "未找到时政要闻列表容器 (div.downlist)";
            }

            // Remove the pagination area so its links are not crawled
            Elements pageListElements = downlistContainer.select("div.pageList");
            pageListElements.remove();

            // Extract the news links from the list
            Elements newsElements = downlistContainer.select("a[href*=.aspx]");
            // Fall back to a more general selector if nothing was found
            if (newsElements.isEmpty()) {
                newsElements = downlistContainer.select("li a");
            }
            System.out.println("找到 " + newsElements.size() + " 个新闻链接");

            AtomicInteger successCount = new AtomicInteger(0);

            // Process each news link
            for (Element element : newsElements) {
                String href = element.attr("href");
                String title = element.text();
                System.out.println("界面href: " + href);
                System.out.println("界面title: " + title);
                if (href != null && !href.isEmpty() && title != null && !title.isEmpty()) {
                    try {
                        // Make sure the URL is absolute, replacing the domain with the IP address
                        if (!href.startsWith("http")) {
                            // Prepend the IP-based host for relative links
                            href = "http://53.1.230.100" + (href.startsWith("/") ? "" : "/") + href;
                        } else {
                            // The URL is already absolute, so only swap the domain part
                            href = href.replace("www.ah", "53.1.230.100");
                        }
                        System.out.println("生成href: " + href);

                        // Fetch the detail page
                        Document detailDoc = Jsoup.connect(href)
                                .userAgent(USER_AGENT)
                                .timeout(TIMEOUT)
                                .get();

                        // Strip the view-count number and parentheses from the title
                        String cleanTitle = removeAccessCountFromTitle(title);
                        // Use the cleaned title as the file name, filtering illegal characters
                        String fileName = cleanTitle.replaceAll("[\\\\/:*?\"<>|]", "_") + ".html";
                        System.out.println("生成fileName: " + fileName);

                        // Save the HTML content (platform default charset)
                        Path filePath = Paths.get(crawlerConfig.getDir(), fileName);
                        Files.write(filePath, detailDoc.html().getBytes());

                        // Record the crawled file
                        crawledFiles.add(fileName);
                        successCount.incrementAndGet();
                        System.out.println("已保存: " + fileName);

                        // Delay between requests to avoid being blocked
                        Thread.sleep(1000);
                    } catch (Exception e) {
                        // Restore the interrupt flag if the delay was interrupted
                        if (e instanceof InterruptedException) {
                            Thread.currentThread().interrupt();
                        }
                        System.err.println("处理新闻失败: " + title + " - " + e.getMessage());
                    }
                }
            }
            return String.format("第%d页爬取完成,成功保存%d个新闻文件", pageNum, successCount.get());
        } catch (IOException e) {
            return "爬取页面失败: " + e.getMessage();
        }
    }

    /**
     * Strips the trailing view-count number and its parentheses from a title.
     * Example: "习近平回信勉励全国特岗教师代表...(121)" -> "习近平回信勉励全国特岗教师代表..."
     */
    private String removeAccessCountFromTitle(String title) {
        // Match and remove a trailing "(digits)", including any leading whitespace
        return title.replaceAll("\\s*\\(\\d+\\)$", "");
    }

    @Override
    public String crawlAllNews() {
        StringBuilder result = new StringBuilder();
        int totalSuccess = 0;
        int totalPages = Integer.parseInt(crawlerConfig.getPage());

        for (int page = 1; page <= totalPages; page++) {
            String pageResult = crawlNewsByPage(page);
            result.append(pageResult).append("\n");

            // Extract the per-page success count from the result message
            Pattern pattern = Pattern.compile("成功保存(\\d+)个新闻文件");
            Matcher matcher = pattern.matcher(pageResult);
            if (matcher.find()) {
                totalSuccess += Integer.parseInt(matcher.group(1));
            }

            // Delay between pages
            try {
                Thread.sleep(2000);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }
        result.append(String.format("全部爬取完成,共保存%d个新闻文件", totalSuccess));
        return result.toString();
    }
}
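
The class above depends on two types that are not shown in this file: the PoliceNewsCrawlerService interface it implements and the injected CrawlerConfig bean. Only the members actually used here are known (crawlNewsByPage, crawlAllNews, getUrl, getPage, getDir, with getPage returning a String, as implied by Integer.parseInt). The following is a minimal sketch under those assumptions; the @Value property keys, field names, and @Component annotation are illustrative guesses, not the actual source.

// PoliceNewsCrawlerService.java (sketch; method signatures taken from the @Override usage above)
package com.mass.service;

public interface PoliceNewsCrawlerService {
    String crawlNewsByPage(int pageNum);
    String crawlAllNews();
}

// CrawlerConfig.java (hypothetical sketch; only the three getters are known from the service code)
package com.mass.entity;

import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;

@Component
public class CrawlerConfig {
    @Value("${crawler.url}")   // assumed property key: list-page base URL, page number is appended
    private String url;
    @Value("${crawler.page}")  // assumed property key: total page count, parsed with Integer.parseInt
    private String page;
    @Value("${crawler.dir}")   // assumed property key: output directory for saved HTML files
    private String dir;

    public String getUrl() { return url; }
    public String getPage() { return page; }
    public String getDir() { return dir; }
}

A caller would typically invoke crawlNewsByPage(pageNum) for a single listing page or crawlAllNews() for the full range, for example from a Spring MVC controller that returns the result string to the client.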