Showing 1 changed file with 33 additions and 14 deletions
| @@ -112,24 +112,43 @@ public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService { | @@ -112,24 +112,43 @@ public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService { | ||
| 112 | Elements headDpsElements = detailDoc.select("div.headDPS"); | 112 | Elements headDpsElements = detailDoc.select("div.headDPS"); |
| 113 | headDpsElements.remove(); | 113 | headDpsElements.remove(); |
| 114 | 114 | ||
| 115 | + // 去除底部部分 (div.bottomDPS) | ||
| 116 | + Elements bottomDpsElements = detailDoc.select("div.bottomDPS"); | ||
| 117 | + bottomDpsElements.remove(); | ||
| 118 | + | ||
| 115 | // 去除位置导航部分 (div.position) | 119 | // 去除位置导航部分 (div.position) |
| 116 | Elements positionElements = detailDoc.select("div.position"); | 120 | Elements positionElements = detailDoc.select("div.position"); |
| 117 | positionElements.remove(); | 121 | positionElements.remove(); |
| 118 | 122 | ||
| 119 | - // 去除底部部分 (div.bottomDPS) | ||
| 120 | - Elements bottomDpsElements = detailDoc.select("div.bottomDPS"); | ||
| 121 | - bottomDpsElements.remove(); | 123 | + // 获取正文区域 |
| 124 | + Element mainsub = detailDoc.select("div.mainsub").first(); | ||
| 125 | + if (mainsub != null) { | ||
| 126 | + // 补全正文中的下载文件地址 | ||
| 127 | + Elements downloadLinks = mainsub.select("a[href]"); | ||
| 128 | + for (Element link : downloadLinks) { | ||
| 129 | + String fileHref = link.attr("href"); | ||
| 130 | + if (fileHref.startsWith("/") || fileHref.startsWith("uploadfiles/")) { | ||
| 131 | + // 补全下载文件地址 | ||
| 132 | + String fullUrl = "http://53.1.230.100" + | ||
| 133 | + (fileHref.startsWith("/") ? "" : "/") + | ||
| 134 | + fileHref; | ||
| 135 | + link.attr("href", fullUrl); | ||
| 136 | + System.out.println("补全下载地址: " + fullUrl); | ||
| 137 | + } | ||
| 138 | + } | ||
| 122 | 139 | ||
| 123 | - // 补全正文中的下载文件地址 | ||
| 124 | - Elements downloadLinks = detailDoc.select("div.mainsub a[href]"); | ||
| 125 | - for (Element link : downloadLinks) { | ||
| 126 | - String fileHref = link.attr("href"); | ||
| 127 | - if (fileHref.startsWith("/") || fileHref.startsWith("uploadfiles/")) { | ||
| 128 | - // 补全下载文件地址 | ||
| 129 | - String fullUrl = "http://53.1.230.100" + | ||
| 130 | - (fileHref.startsWith("/") ? "" : "/") + | ||
| 131 | - fileHref; | ||
| 132 | - link.attr("href", fullUrl); | 140 | + // 补全正文中的图片地址 |
| 141 | + Elements images = mainsub.select("img[src]"); | ||
| 142 | + for (Element img : images) { | ||
| 143 | + String imgSrc = img.attr("src"); | ||
| 144 | + if (imgSrc.startsWith("/") || imgSrc.startsWith("uploadfiles/")) { | ||
| 145 | + // 补全图片地址 | ||
| 146 | + String fullUrl = "http://53.1.230.100" + | ||
| 147 | + (imgSrc.startsWith("/") ? "" : "/") + | ||
| 148 | + imgSrc; | ||
| 149 | + img.attr("src", fullUrl); | ||
| 150 | + System.out.println("补全图片地址: " + fullUrl); | ||
| 151 | + } | ||
| 133 | } | 152 | } |
| 134 | } | 153 | } |
| 135 | 154 | ||
| @@ -140,7 +159,7 @@ public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService { | @@ -140,7 +159,7 @@ public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService { | ||
| 140 | String fileName = cleanTitle.replaceAll("[\\\\/:*?\"<>|]", "_") + ".html"; | 159 | String fileName = cleanTitle.replaceAll("[\\\\/:*?\"<>|]", "_") + ".html"; |
| 141 | System.out.println("生成fileName: " + fileName); | 160 | System.out.println("生成fileName: " + fileName); |
| 142 | 161 | ||
| 143 | - // 保存HTML内容(已去除多余部分并补全下载地址) | 162 | + // 保存HTML内容(已去除多余部分并补全下载和图片地址) |
| 144 | Path filePath = Paths.get(crawlerConfig.getDir(), fileName); | 163 | Path filePath = Paths.get(crawlerConfig.getDir(), fileName); |
| 145 | Files.write(filePath, detailDoc.html().getBytes()); | 164 | Files.write(filePath, detailDoc.html().getBytes()); |
| 146 | 165 |