Showing 1 changed file with 26 additions and 4 deletions
| @@ -89,9 +89,6 @@ public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService { | @@ -89,9 +89,6 @@ public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService { | ||
| 89 | String href = element.attr("href"); | 89 | String href = element.attr("href"); |
| 90 | String title = element.text(); | 90 | String title = element.text(); |
| 91 | 91 | ||
| 92 | - System.out.println("界面href: " + href); | ||
| 93 | - System.out.println("界面title: " + title); | ||
| 94 | - | ||
| 95 | if (href != null && !href.isEmpty() && title != null && !title.isEmpty()) { | 92 | if (href != null && !href.isEmpty() && title != null && !title.isEmpty()) { |
| 96 | try { | 93 | try { |
| 97 | // 确保URL是绝对路径,并替换域名为IP | 94 | // 确保URL是绝对路径,并替换域名为IP |
| @@ -111,6 +108,31 @@ public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService { | @@ -111,6 +108,31 @@ public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService { | ||
| 111 | .timeout(TIMEOUT) | 108 | .timeout(TIMEOUT) |
| 112 | .get(); | 109 | .get(); |
| 113 | 110 | ||
| 111 | + // 去除表头部分 (div.headDPS) | ||
| 112 | + Elements headDpsElements = detailDoc.select("div.headDPS"); | ||
| 113 | + headDpsElements.remove(); | ||
| 114 | + | ||
| 115 | + // 去除位置导航部分 (div.position) | ||
| 116 | + Elements positionElements = detailDoc.select("div.position"); | ||
| 117 | + positionElements.remove(); | ||
| 118 | + | ||
| 119 | + // 去除底部部分 (div.bottomDPS) | ||
| 120 | + Elements bottomDpsElements = detailDoc.select("div.bottomDPS"); | ||
| 121 | + bottomDpsElements.remove(); | ||
| 122 | + | ||
| 123 | + // 补全正文中的下载文件地址 | ||
| 124 | + Elements downloadLinks = detailDoc.select("div.mainsub a[href]"); | ||
| 125 | + for (Element link : downloadLinks) { | ||
| 126 | + String fileHref = link.attr("href"); | ||
| 127 | + if (fileHref.startsWith("/") || fileHref.startsWith("uploadfiles/")) { | ||
| 128 | + // 补全下载文件地址 | ||
| 129 | + String fullUrl = "http://53.1.230.100" + | ||
| 130 | + (fileHref.startsWith("/") ? "" : "/") + | ||
| 131 | + fileHref; | ||
| 132 | + link.attr("href", fullUrl); | ||
| 133 | + } | ||
| 134 | + } | ||
| 135 | + | ||
| 114 | // 去除标题中的访问量数字和括号 | 136 | // 去除标题中的访问量数字和括号 |
| 115 | String cleanTitle = removeAccessCountFromTitle(title); | 137 | String cleanTitle = removeAccessCountFromTitle(title); |
| 116 | 138 | ||
| @@ -118,7 +140,7 @@ public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService { | @@ -118,7 +140,7 @@ public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService { | ||
| 118 | String fileName = cleanTitle.replaceAll("[\\\\/:*?\"<>|]", "_") + ".html"; | 140 | String fileName = cleanTitle.replaceAll("[\\\\/:*?\"<>|]", "_") + ".html"; |
| 119 | System.out.println("生成fileName: " + fileName); | 141 | System.out.println("生成fileName: " + fileName); |
| 120 | 142 | ||
| 121 | - // 保存HTML内容 | 143 | + // 保存HTML内容(已去除多余部分并补全下载地址) |
| 122 | Path filePath = Paths.get(crawlerConfig.getDir(), fileName); | 144 | Path filePath = Paths.get(crawlerConfig.getDir(), fileName); |
| 123 | Files.write(filePath, detailDoc.html().getBytes()); | 145 | Files.write(filePath, detailDoc.html().getBytes()); |
| 124 | 146 |