/**
 * Creates a page fetcher whose HTTP client is configured from the given crawl settings.
 *
 * <p>The constructor:
 * <ul>
 *   <li>configures protocol parameters (HTTP/1.1, UTF-8 content charset, no
 *       {@code Expect: 100-continue}, browser-compatible cookies, user agent and timeouts
 *       taken from {@code config}, automatic redirect handling turned off);</li>
 *   <li>registers the {@code http} scheme, and {@code https} only when
 *       {@code config.isIncludeHttpsPages()} is true;</li>
 *   <li>creates a pooling connection manager sized from {@code config};</li>
 *   <li>optionally routes traffic through the configured proxy, with credentials when a
 *       proxy user name is present;</li>
 *   <li>installs a response interceptor that transparently unwraps gzip-encoded bodies;</li>
 *   <li>creates (if not yet present) and starts the idle-connection monitor thread.</li>
 * </ul>
 *
 * @param config crawl configuration supplying user agent, timeouts, pool sizes, HTTPS
 *               switch and proxy settings
 */
public PageFetcher(CrawlConfig config) {
    super(config);

    // --- Protocol and connection parameters ---------------------------------------------
    HttpParams params = new BasicHttpParams();
    HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
    paramsBean.setVersion(HttpVersion.HTTP_1_1);
    paramsBean.setContentCharset("UTF-8");
    paramsBean.setUseExpectContinue(false);

    params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY);
    params.setParameter(CoreProtocolPNames.USER_AGENT, config.getUserAgentString());
    params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeout());
    params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout());
    // The client must not follow redirects on its own.
    params.setBooleanParameter("http.protocol.handle-redirects", false);

    // --- Supported schemes ---------------------------------------------------------------
    SchemeRegistry schemeRegistry = new SchemeRegistry();
    schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
    if (config.isIncludeHttpsPages()) {
        schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));
    }

    // --- Connection pool and client ------------------------------------------------------
    connectionManager = new PoolingClientConnectionManager(schemeRegistry);
    connectionManager.setMaxTotal(config.getMaxTotalConnections());
    connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost());

    httpClient = new DefaultHttpClient(connectionManager, params);

    // --- Optional proxy ------------------------------------------------------------------
    if (config.getProxyHost() != null) {
        if (config.getProxyUsername() != null) {
            httpClient.getCredentialsProvider().setCredentials(
                    new AuthScope(config.getProxyHost(), config.getProxyPort()),
                    new UsernamePasswordCredentials(
                            config.getProxyUsername(), config.getProxyPassword()));
        }
        HttpHost proxy = new HttpHost(config.getProxyHost(), config.getProxyPort());
        httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
    }

    // --- Transparent gzip decoding -------------------------------------------------------
    httpClient.addResponseInterceptor(new HttpResponseInterceptor() {
        @Override
        public void process(final HttpResponse response, final HttpContext context)
                throws HttpException, IOException {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                // Responses without a body carry no entity; nothing to decode.
                return;
            }
            Header contentEncoding = entity.getContentEncoding();
            if (contentEncoding == null) {
                return;
            }
            for (HeaderElement codec : contentEncoding.getElements()) {
                if (codec.getName().equalsIgnoreCase("gzip")) {
                    // Wrap the body so that callers read the decompressed stream.
                    response.setEntity(new GzipDecompressingEntity(entity));
                    return;
                }
            }
        }
    });

    // --- Idle connection monitor ---------------------------------------------------------
    // NOTE(review): if connectionMonitorThread can already be non-null and started when this
    // constructor runs (e.g. a shared field), start() below would fail - confirm against the
    // field declaration.
    if (connectionMonitorThread == null) {
        connectionMonitorThread = new IdleConnectionMonitorThread(connectionManager);
    }
    connectionMonitorThread.start();
}
public synchronized static void startConnectionMonitorThread() { if (connectionMonitorThread == null) { HttpParams params = new BasicHttpParams(); HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setVersion(HttpVersion.HTTP_1_1); paramsBean.setContentCharset("UTF-8"); paramsBean.setUseExpectContinue(false); params.setParameter("http.useragent", Configurations.getStringProperty("fetcher.user_agent", "crawler4j (http://code.google.com/p/crawler4j/)")); params.setIntParameter("http.socket.timeout", Configurations.getIntProperty("fetcher.socket_timeout", 20000)); params.setIntParameter("http.connection.timeout", Configurations.getIntProperty("fetcher.connection_timeout", 30000)); params.setBooleanParameter("http.protocol.handle-redirects", false); ConnPerRouteBean connPerRouteBean = new ConnPerRouteBean(); connPerRouteBean.setDefaultMaxPerRoute(Configurations.getIntProperty("fetcher.max_connections_per_host", 100)); ConnManagerParams.setMaxConnectionsPerRoute(params, connPerRouteBean); ConnManagerParams.setMaxTotalConnections(params, Configurations.getIntProperty("fetcher.max_total_connections", 100)); SchemeRegistry schemeRegistry = new SchemeRegistry(); schemeRegistry.register(new Scheme("http", PlainSocketFactory.getSocketFactory(), 80)); if (Configurations.getBooleanProperty("fetcher.crawl_https", false)) { schemeRegistry.register(new Scheme("https", SSLSocketFactory.getSocketFactory(), 443)); } connectionManager = new ThreadSafeClientConnManager(params, schemeRegistry); //ProjectLogger.LOGGER.setLevel(Level.INFO); httpclient = new DefaultHttpClient(connectionManager, params); connectionMonitorThread = new IdleConnectionMonitorThread(connectionManager); } connectionMonitorThread.start(); }