Commit e32bb14498b37f3d9672702996338bae1636cf11

Authored by 盛长浩
1 parent 494c85fb

公安网爬虫代码优化

... ... @@ -89,9 +89,6 @@ public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService {
89 89 String href = element.attr("href");
90 90 String title = element.text();
91 91
92   - System.out.println("界面href: " + href);
93   - System.out.println("界面title: " + title);
94   -
95 92 if (href != null && !href.isEmpty() && title != null && !title.isEmpty()) {
96 93 try {
97 94 // 确保URL是绝对路径,并替换域名为IP
... ... @@ -111,6 +108,31 @@ public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService {
111 108 .timeout(TIMEOUT)
112 109 .get();
113 110
  111 + // 去除表头部分 (div.headDPS)
  112 + Elements headDpsElements = detailDoc.select("div.headDPS");
  113 + headDpsElements.remove();
  114 +
  115 + // 去除位置导航部分 (div.position)
  116 + Elements positionElements = detailDoc.select("div.position");
  117 + positionElements.remove();
  118 +
  119 + // 去除底部部分 (div.bottomDPS)
  120 + Elements bottomDpsElements = detailDoc.select("div.bottomDPS");
  121 + bottomDpsElements.remove();
  122 +
  123 + // 补全正文中的下载文件地址
  124 + Elements downloadLinks = detailDoc.select("div.mainsub a[href]");
  125 + for (Element link : downloadLinks) {
  126 + String fileHref = link.attr("href");
  127 + if (fileHref.startsWith("/") || fileHref.startsWith("uploadfiles/")) {
  128 + // 补全下载文件地址
  129 + String fullUrl = "http://53.1.230.100" +
  130 + (fileHref.startsWith("/") ? "" : "/") +
  131 + fileHref;
  132 + link.attr("href", fullUrl);
  133 + }
  134 + }
  135 +
114 136 // 去除标题中的访问量数字和括号
115 137 String cleanTitle = removeAccessCountFromTitle(title);
116 138
... ... @@ -118,7 +140,7 @@ public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService {
118 140 String fileName = cleanTitle.replaceAll("[\\\\/:*?\"<>|]", "_") + ".html";
119 141 System.out.println("生成fileName: " + fileName);
120 142
121   - // 保存HTML内容
  143 + // 保存HTML内容(已去除多余部分并补全下载地址)
122 144 Path filePath = Paths.get(crawlerConfig.getDir(), fileName);
123 145 Files.write(filePath, detailDoc.html().getBytes());
124 146
... ...