Commit 4a6c5559f6aab1040097e0a1ef0c84d6b0ce8505

Authored by 盛长浩
1 parent e32bb144

公安网爬虫代码优化1

@@ -112,24 +112,43 @@ public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService {
112 Elements headDpsElements = detailDoc.select("div.headDPS"); 112 Elements headDpsElements = detailDoc.select("div.headDPS");
113 headDpsElements.remove(); 113 headDpsElements.remove();
114 114
  115 + // 去除底部部分 (div.bottomDPS)
  116 + Elements bottomDpsElements = detailDoc.select("div.bottomDPS");
  117 + bottomDpsElements.remove();
  118 +
115 // 去除位置导航部分 (div.position) 119 // 去除位置导航部分 (div.position)
116 Elements positionElements = detailDoc.select("div.position"); 120 Elements positionElements = detailDoc.select("div.position");
117 positionElements.remove(); 121 positionElements.remove();
118 122
119 - // 去除底部部分 (div.bottomDPS)  
120 - Elements bottomDpsElements = detailDoc.select("div.bottomDPS");  
121 - bottomDpsElements.remove(); 123 + // 获取正文区域
  124 + Element mainsub = detailDoc.select("div.mainsub").first();
  125 + if (mainsub != null) {
  126 + // 补全正文中的下载文件地址
  127 + Elements downloadLinks = mainsub.select("a[href]");
  128 + for (Element link : downloadLinks) {
  129 + String fileHref = link.attr("href");
  130 + if (fileHref.startsWith("/") || fileHref.startsWith("uploadfiles/")) {
  131 + // 补全下载文件地址
  132 + String fullUrl = "http://53.1.230.100" +
  133 + (fileHref.startsWith("/") ? "" : "/") +
  134 + fileHref;
  135 + link.attr("href", fullUrl);
  136 + System.out.println("补全下载地址: " + fullUrl);
  137 + }
  138 + }
122 139
123 - // 补全正文中的下载文件地址  
124 - Elements downloadLinks = detailDoc.select("div.mainsub a[href]");  
125 - for (Element link : downloadLinks) {  
126 - String fileHref = link.attr("href");  
127 - if (fileHref.startsWith("/") || fileHref.startsWith("uploadfiles/")) {  
128 - // 补全下载文件地址  
129 - String fullUrl = "http://53.1.230.100" +  
130 - (fileHref.startsWith("/") ? "" : "/") +  
131 - fileHref;  
132 - link.attr("href", fullUrl); 140 + // 补全正文中的图片地址
  141 + Elements images = mainsub.select("img[src]");
  142 + for (Element img : images) {
  143 + String imgSrc = img.attr("src");
  144 + if (imgSrc.startsWith("/") || imgSrc.startsWith("uploadfiles/")) {
  145 + // 补全图片地址
  146 + String fullUrl = "http://53.1.230.100" +
  147 + (imgSrc.startsWith("/") ? "" : "/") +
  148 + imgSrc;
  149 + img.attr("src", fullUrl);
  150 + System.out.println("补全图片地址: " + fullUrl);
  151 + }
133 } 152 }
134 } 153 }
135 154
@@ -140,7 +159,7 @@ public class PoliceNewsCrawlerServiceImpl implements PoliceNewsCrawlerService {
140 String fileName = cleanTitle.replaceAll("[\\\\/:*?\"<>|]", "_") + ".html"; 159 String fileName = cleanTitle.replaceAll("[\\\\/:*?\"<>|]", "_") + ".html";
141 System.out.println("生成fileName: " + fileName); 160 System.out.println("生成fileName: " + fileName);
142 161
143 - // 保存HTML内容(已去除多余部分并补全下载地址) 162 + // 保存HTML内容(已去除多余部分并补全下载和图片地址)
144 Path filePath = Paths.get(crawlerConfig.getDir(), fileName); 163 Path filePath = Paths.get(crawlerConfig.getDir(), fileName);
145 Files.write(filePath, detailDoc.html().getBytes()); 164 Files.write(filePath, detailDoc.html().getBytes());
146 165