1. <rp id="c4hsd"></rp>
          1. <button id="c4hsd"><acronym id="c4hsd"><input id="c4hsd"></input></acronym></button>
          2. <tbody id="c4hsd"></tbody>

            <dd id="c4hsd"><pre id="c4hsd"></pre></dd>

              1. <th id="c4hsd"></th>
              2. <s id="c4hsd"><object id="c4hsd"></object></s>

                    [ 登錄 ] - [ 注冊 ] | 代碼示例DEMO | IP測試視頻 |

                    Java配合爬蟲代理IP采集大眾點評店鋪信息

                    作者:數據無憂   時間:2020-09-18 13:24:53
                    大眾點評店鋪網址格式如下:
                    http://www.dianping.com/shop/6000000/
                    http://www.dianping.com/shop/6000001/

                    shop后面的ID是連續的,范圍是1-1500萬,當然有許多店鋪是不存在的(404錯誤),實際的店鋪數量在700萬左右,這里是用的窮舉法,當然也可以進入網頁按深度索引。

                    程序采集過程中會發現,大眾點評采取了嚴格的反爬蟲措施:如果同一個IP以每秒一個請求的速度進行采集,大概采集500-1000個頁面之后就會出現403錯誤,說明該IP已被凍結,需要過一段時間才會解封;如果在凍結期間仍不死心、繼續大量采集,該IP就會被永久凍結。

                    其實這個問題很好解決:使用爬蟲代理IP,403問題便迎刃而解。爬蟲代理IP網址: http://www.aooseo.com/buy/dynamic.html

                    代碼如下:


                    import java.io.BufferedInputStream;
                    import java.io.InputStream;
                    import java.net.HttpURLConnection;
                    import java.util.ArrayList;
                    import java.util.List;
                    
                    import org.jsoup.Jsoup;
                    import org.jsoup.nodes.Document;
                    import org.jsoup.nodes.Element;
                    import org.jsoup.select.Elements;
                    
                    import com.gargoylesoftware.htmlunit.BrowserVersion;
                    import com.gargoylesoftware.htmlunit.Page;
                    import com.gargoylesoftware.htmlunit.ProxyConfig;
                    import com.gargoylesoftware.htmlunit.WebClient;
                    import com.gargoylesoftware.htmlunit.WebResponse;
                    import com.gargoylesoftware.htmlunit.html.HtmlPage;
                    import com.gargoylesoftware.htmlunit.util.NameValuePair;
                    
                    /**
                     * 這個DEMO主要為了測試爬蟲(動態)代理IP的穩定性
                     * 完美支持企業信息天眼查、電商Ebay、亞馬遜、新浪微博、法院文書、分類信息等
                     * 也可以作為爬蟲參考項目,如需使用,請自行修改webParseHtml方法
                     */
                    public class TestDynamicIpContinue {
                    	
                    	public static List ipList = new ArrayList<>();
                    	public static boolean gameOver = false;
                    	
                    	public static void main(String[] args) throws Exception {
                    		// 每隔幾秒提取一次IP
                    		long fetchIpSeconds = 5;
                    		int testTime = 3;
                    		
                    		// 請填寫無憂代理IP訂單號,填寫之后才可以提取到IP哦
                    		String order = "88888888888888888888888888888";
                    		
                    		// 你要抓去的目標網址
                    		String targetUrl = "http://www.dianping.com/shop/6000000/";
                    		
                    		// 設置referer信息,如果抓取淘寶、天貓需要設置
                    		String referer = "";
                    		// 開啟對https的支持
                    		boolean https = true;
                    		// 是否輸出Header信息
                    		boolean outputHeaderInfo = false;
                    		// 是否加載JS,加載JS會導致速度變慢
                    		boolean useJS = false;
                    		// 請求超時時間,單位毫秒,默認5秒
                    		int timeOut = 10000;
                    		
                    		if (order == null || "".equals(order)) {
                    			System.err.println("請輸入爬蟲(動態)代理訂單號");
                    			return;
                    		}
                    		System.out.println(">>>>>>>>>>>>>>動態IP測試開始<<<<<<<<<<<<<<");
                    		System.out.println("***************");
                    		System.out.println("提取IP間隔 " + fetchIpSeconds + " 秒 ");
                    		System.out.println("爬蟲目標網址  " + targetUrl);
                    		System.out.println("***************\n");
                    		TestDynamicIpContinue tester = new TestDynamicIpContinue();
                    		new Thread(tester.new GetIP(fetchIpSeconds * 1000, testTime, order, targetUrl, useJS, timeOut, referer, https, outputHeaderInfo)).start();
                    	
                    		while(!gameOver){
                    			try {
                    				Thread.sleep(100);
                    			} catch (InterruptedException e) {
                    				e.printStackTrace();
                    			}
                    		}
                    		System.out.println(">>>>>>>>>>>>>>動態IP測試結束<<<<<<<<<<<<<<");
                    		System.exit(0);
                    	}
                        
                    	// 抓取IP138,檢測IP
                    	public class Crawler extends Thread{
                    		@Override
                    		public void run() {
                    			webParseHtml(targetUrl);
                    		}
                    		
                    		long sleepMs = 200;
                    		boolean useJs = false;
                    		String targetUrl = "";
                    		int timeOut = 5000;
                    		String ipport = "";
                    		
                    		String referer;
                    		boolean https;
                    		boolean outputHeaderInfo;
                    		
                    		public Crawler(long sleepMs, String targetUrl, boolean useJs, int timeOut, String ipport, String referer, boolean https, boolean outputHeader) {
                    			this.sleepMs = sleepMs;
                    			this.targetUrl = targetUrl;
                    			this.useJs = useJs;
                    			this.timeOut = timeOut;
                    			this.ipport = ipport;
                    			
                    			this.referer = referer;
                    			this.https = https;
                    			this.outputHeaderInfo = outputHeader;
                    		}
                    		public String webParseHtml(String url) {
                    			String html = "";
                    			BrowserVersion[] versions = { BrowserVersion.CHROME, BrowserVersion.FIREFOX_38, BrowserVersion.INTERNET_EXPLORER_11, BrowserVersion.INTERNET_EXPLORER_8};
                    			WebClient client = new WebClient(versions[(int)(versions.length * Math.random())]);
                    			try {
                    				client.getOptions().setThrowExceptionOnFailingStatusCode(false);
                    				client.getOptions().setJavaScriptEnabled(useJs);
                    				client.getOptions().setCssEnabled(false);
                    				client.getOptions().setThrowExceptionOnScriptError(false);
                    				client.getOptions().setTimeout(timeOut);
                    				client.getOptions().setAppletEnabled(true);
                    				client.getOptions().setGeolocationEnabled(true);
                    				client.getOptions().setRedirectEnabled(true);
                    				
                    				// 對于HTTPS網站,加上這行代碼可以跳過SSL驗證
                    				client.getOptions().setUseInsecureSSL(https);
                    				
                    				if (referer != null && !"".equals(referer)) {
                    					client.addRequestHeader("Referer", referer);
                    				}
                    				
                    				if (ipport != null) {
                    					ProxyConfig proxyConfig = new ProxyConfig((ipport.split(",")[0]).split(":")[0], Integer.parseInt((ipport.split(",")[0]).split(":")[1]));
                    					client.getOptions().setProxyConfig(proxyConfig);
                    				}else {
                    					System.out.print(".");
                    					return "";
                    				}
                    			
                    				long startMs = System.currentTimeMillis();
                    				
                    				Page page = client.getPage(url);
                    				WebResponse response = page.getWebResponse();
                    				
                    				if (outputHeaderInfo) {
                    					// 輸出header信息
                    					List headers = response.getResponseHeaders();
                    					for (NameValuePair nameValuePair : headers) {
                    						System.out.println(nameValuePair.getName() + "-->" + nameValuePair.getValue());
                    					}
                    				}
                    				
                    				boolean isJson = false ;
                    				if (response.getContentType().equals("application/json")) {
                    					html = response.getContentAsString();
                    					isJson = true ;
                    				}else if(page.isHtmlPage()){
                    					html = ((HtmlPage)page).asXml();
                    				}
                    				
                    				long endMs = System.currentTimeMillis();
                    				
                    				Document doc = Jsoup.parse(html);System.out.println(getName() + " " + ipport + " 用時 " + (endMs - startMs) + "毫秒 :" + doc.select("title").text());				
                    } catch (Exception e) { System.err.println(ipport + ":" + e.getMessage()); } finally { client.close(); } return html; } } // 定時獲取動態IP public class GetIP implements Runnable{ long sleepMs = 1000; int maxTime = 3; String order = ""; String targetUrl; boolean useJs; int timeOut; String referer; boolean https; boolean outputHeaderInfo; public GetIP(long sleepMs, int maxTime, String order, String targetUrl, boolean useJs, int timeOut, String referer, boolean https, boolean outputHeaderInfo) { this.sleepMs = sleepMs; this.maxTime = maxTime; this.order = order; this.targetUrl = targetUrl; this.useJs = useJs; this.timeOut = timeOut; this.referer=referer; this.https=https; this.outputHeaderInfo=outputHeaderInfo; } @Override public void run() { int time = 1; while(!gameOver){ if(time >= 4){ gameOver = true; break; } try { java.net.URL url = new java.net.URL("http://api.ip.data5u.com/dynamic/get.html?order=" + order + "&ttl&random=true"); HttpURLConnection connection = (HttpURLConnection)url.openConnection(); connection.setConnectTimeout(3000); connection = (HttpURLConnection)url.openConnection(); InputStream raw = connection.getInputStream(); InputStream in = new BufferedInputStream(raw); byte[] data = new byte[in.available()]; int bytesRead = 0; int offset = 0; while(offset < data.length) { bytesRead = in.read(data, offset, data.length - offset); if(bytesRead == -1) { break; } offset += bytesRead; } in.close(); raw.close(); String[] res = new String(data, "UTF-8").split("\n"); System.out.println(">>>>>>>>>>>>>>當前返回IP量 " + res.length); for (String ip : res) { new Crawler(100, targetUrl, useJs, timeOut, ip, referer, https, outputHeaderInfo).start(); } } catch (Exception e) { System.err.println(">>>>>>>>>>>>>>獲取IP出錯, " + e.getMessage()); } try { Thread.sleep(sleepMs); } catch (InterruptedException e) { e.printStackTrace(); } } } } public String joinList(List list){ StringBuilder re = new StringBuilder(); for (String string : list) { 
re.append(string).append(","); } return re.toString(); } public String trim(String html) { if (html != null) { return html.replaceAll(" ", "").replaceAll("\n", ""); } return null; } }


                    無憂代理IP(www.aooseo.com)原創文章,轉載請注明出處。

                    電話:4007-745-096
                    QQ:
                    周一至周日8:30-18:00 技術部電話熱線
                    久久夜色精品国产噜噜亚洲AV_老妇女性较大毛片_888亚洲欧美国产va在线播放_超碰人人透人人爽人人看