Commit e32bb14498b37f3d9672702996338bae1636cf11

Authored by 盛长浩
1 parent 494c85fb

公安网爬虫代码优化

@@ -89,9 +89,6 @@ public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService {
89 String href = element.attr("href"); 89 String href = element.attr("href");
90 String title = element.text(); 90 String title = element.text();
91 91
92 - System.out.println("界面href: " + href);  
93 - System.out.println("界面title: " + title);  
94 -  
95 if (href != null && !href.isEmpty() && title != null && !title.isEmpty()) { 92 if (href != null && !href.isEmpty() && title != null && !title.isEmpty()) {
96 try { 93 try {
97 // 确保URL是绝对路径,并替换域名为IP 94 // 确保URL是绝对路径,并替换域名为IP
@@ -111,6 +108,31 @@ public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService {
111 .timeout(TIMEOUT) 108 .timeout(TIMEOUT)
112 .get(); 109 .get();
113 110
  111 + // 去除表头部分 (div.headDPS)
  112 + Elements headDpsElements = detailDoc.select("div.headDPS");
  113 + headDpsElements.remove();
  114 +
  115 + // 去除位置导航部分 (div.position)
  116 + Elements positionElements = detailDoc.select("div.position");
  117 + positionElements.remove();
  118 +
  119 + // 去除底部部分 (div.bottomDPS)
  120 + Elements bottomDpsElements = detailDoc.select("div.bottomDPS");
  121 + bottomDpsElements.remove();
  122 +
  123 + // 补全正文中的下载文件地址
  124 + Elements downloadLinks = detailDoc.select("div.mainsub a[href]");
  125 + for (Element link : downloadLinks) {
  126 + String fileHref = link.attr("href");
  127 + if (fileHref.startsWith("/") || fileHref.startsWith("uploadfiles/")) {
  128 + // 补全下载文件地址
  129 + String fullUrl = "http://53.1.230.100" +
  130 + (fileHref.startsWith("/") ? "" : "/") +
  131 + fileHref;
  132 + link.attr("href", fullUrl);
  133 + }
  134 + }
  135 +
114 // 去除标题中的访问量数字和括号 136 // 去除标题中的访问量数字和括号
115 String cleanTitle = removeAccessCountFromTitle(title); 137 String cleanTitle = removeAccessCountFromTitle(title);
116 138
@@ -118,7 +140,7 @@ public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService {
118 String fileName = cleanTitle.replaceAll("[\\\\/:*?\"<>|]", "_") + ".html"; 140 String fileName = cleanTitle.replaceAll("[\\\\/:*?\"<>|]", "_") + ".html";
119 System.out.println("生成fileName: " + fileName); 141 System.out.println("生成fileName: " + fileName);
120 142
121 - // 保存HTML内容 143 + // 保存HTML内容(已去除多余部分并补全下载地址)
122 Path filePath = Paths.get(crawlerConfig.getDir(), fileName); 144 Path filePath = Paths.get(crawlerConfig.getDir(), fileName);
123 Files.write(filePath, detailDoc.html().getBytes()); 145 Files.write(filePath, detailDoc.html().getBytes());
124 146