Showing
1 changed file
with
33 additions
and
14 deletions
| ... | ... | @@ -112,24 +112,43 @@ public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService { |
| 112 | 112 | Elements headDpsElements = detailDoc.select("div.headDPS"); |
| 113 | 113 | headDpsElements.remove(); |
| 114 | 114 | |
| 115 | + // 去除底部部分 (div.bottomDPS) | |
| 116 | + Elements bottomDpsElements = detailDoc.select("div.bottomDPS"); | |
| 117 | + bottomDpsElements.remove(); | |
| 118 | + | |
| 115 | 119 | // 去除位置导航部分 (div.position) |
| 116 | 120 | Elements positionElements = detailDoc.select("div.position"); |
| 117 | 121 | positionElements.remove(); |
| 118 | 122 | |
| 119 | - // 去除底部部分 (div.bottomDPS) | |
| 120 | - Elements bottomDpsElements = detailDoc.select("div.bottomDPS"); | |
| 121 | - bottomDpsElements.remove(); | |
| 123 | + // 获取正文区域 | |
| 124 | + Element mainsub = detailDoc.select("div.mainsub").first(); | |
| 125 | + if (mainsub != null) { | |
| 126 | + // 补全正文中的下载文件地址 | |
| 127 | + Elements downloadLinks = mainsub.select("a[href]"); | |
| 128 | + for (Element link : downloadLinks) { | |
| 129 | + String fileHref = link.attr("href"); | |
| 130 | + if (fileHref.startsWith("/") || fileHref.startsWith("uploadfiles/")) { | |
| 131 | + // 补全下载文件地址 | |
| 132 | + String fullUrl = "http://53.1.230.100" + | |
| 133 | + (fileHref.startsWith("/") ? "" : "/") + | |
| 134 | + fileHref; | |
| 135 | + link.attr("href", fullUrl); | |
| 136 | + System.out.println("补全下载地址: " + fullUrl); | |
| 137 | + } | |
| 138 | + } | |
| 122 | 139 | |
| 123 | - // 补全正文中的下载文件地址 | |
| 124 | - Elements downloadLinks = detailDoc.select("div.mainsub a[href]"); | |
| 125 | - for (Element link : downloadLinks) { | |
| 126 | - String fileHref = link.attr("href"); | |
| 127 | - if (fileHref.startsWith("/") || fileHref.startsWith("uploadfiles/")) { | |
| 128 | - // 补全下载文件地址 | |
| 129 | - String fullUrl = "http://53.1.230.100" + | |
| 130 | - (fileHref.startsWith("/") ? "" : "/") + | |
| 131 | - fileHref; | |
| 132 | - link.attr("href", fullUrl); | |
| 140 | + // 补全正文中的图片地址 | |
| 141 | + Elements images = mainsub.select("img[src]"); | |
| 142 | + for (Element img : images) { | |
| 143 | + String imgSrc = img.attr("src"); | |
| 144 | + if (imgSrc.startsWith("/") || imgSrc.startsWith("uploadfiles/")) { | |
| 145 | + // 补全图片地址 | |
| 146 | + String fullUrl = "http://53.1.230.100" + | |
| 147 | + (imgSrc.startsWith("/") ? "" : "/") + | |
| 148 | + imgSrc; | |
| 149 | + img.attr("src", fullUrl); | |
| 150 | + System.out.println("补全图片地址: " + fullUrl); | |
| 151 | + } | |
| 133 | 152 | } |
| 134 | 153 | } |
| 135 | 154 | |
| ... | ... | @@ -140,7 +159,7 @@ public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService { |
| 140 | 159 | String fileName = cleanTitle.replaceAll("[\\\\/:*?\"<>|]", "_") + ".html"; |
| 141 | 160 | System.out.println("生成fileName: " + fileName); |
| 142 | 161 | |
| 143 | - // 保存HTML内容(已去除多余部分并补全下载地址) | |
| 162 | + // 保存HTML内容(已去除多余部分并补全下载和图片地址) | |
| 144 | 163 | Path filePath = Paths.get(crawlerConfig.getDir(), fileName); |
| 145 | 164 | Files.write(filePath, detailDoc.html().getBytes()); |
| 146 | 165 | ... | ... |