Showing 1 changed file with 26 additions and 4 deletions
| ... | ... | @@ -89,9 +89,6 @@ public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService { |
| 89 | 89 | String href = element.attr("href"); |
| 90 | 90 | String title = element.text(); |
| 91 | 91 | |
| 92 | - System.out.println("界面href: " + href); | |
| 93 | - System.out.println("界面title: " + title); | |
| 94 | - | |
| 95 | 92 | if (href != null && !href.isEmpty() && title != null && !title.isEmpty()) { |
| 96 | 93 | try { |
| 97 | 94 | // 确保URL是绝对路径,并替换域名为IP |
| ... | ... | @@ -111,6 +108,31 @@ public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService { |
| 111 | 108 | .timeout(TIMEOUT) |
| 112 | 109 | .get(); |
| 113 | 110 | |
| 111 | + // 去除表头部分 (div.headDPS) | |
| 112 | + Elements headDpsElements = detailDoc.select("div.headDPS"); | |
| 113 | + headDpsElements.remove(); | |
| 114 | + | |
| 115 | + // 去除位置导航部分 (div.position) | |
| 116 | + Elements positionElements = detailDoc.select("div.position"); | |
| 117 | + positionElements.remove(); | |
| 118 | + | |
| 119 | + // 去除底部部分 (div.bottomDPS) | |
| 120 | + Elements bottomDpsElements = detailDoc.select("div.bottomDPS"); | |
| 121 | + bottomDpsElements.remove(); | |
| 122 | + | |
| 123 | + // 补全正文中的下载文件地址 | |
| 124 | + Elements downloadLinks = detailDoc.select("div.mainsub a[href]"); | |
| 125 | + for (Element link : downloadLinks) { | |
| 126 | + String fileHref = link.attr("href"); | |
| 127 | + if (fileHref.startsWith("/") || fileHref.startsWith("uploadfiles/")) { | |
| 128 | + // 补全下载文件地址 | |
| 129 | + String fullUrl = "http://53.1.230.100" + | |
| 130 | + (fileHref.startsWith("/") ? "" : "/") + | |
| 131 | + fileHref; | |
| 132 | + link.attr("href", fullUrl); | |
| 133 | + } | |
| 134 | + } | |
| 135 | + | |
| 114 | 136 | // 去除标题中的访问量数字和括号 |
| 115 | 137 | String cleanTitle = removeAccessCountFromTitle(title); |
| 116 | 138 | |
| ... | ... | @@ -118,7 +140,7 @@ public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService { |
| 118 | 140 | String fileName = cleanTitle.replaceAll("[\\\\/:*?\"<>|]", "_") + ".html"; |
| 119 | 141 | System.out.println("生成fileName: " + fileName); |
| 120 | 142 | |
| 121 | - // 保存HTML内容 | |
| 143 | + // 保存HTML内容(已去除多余部分并补全下载地址) | |
| 122 | 144 | Path filePath = Paths.get(crawlerConfig.getDir(), fileName); |
| 123 | 145 | Files.write(filePath, detailDoc.html().getBytes()); |
| 124 | 146 | ... | ... |