View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.http;
19  
20  import org.apache.commons.io.IOUtils;
21  import org.apache.http.Header;
22  import org.apache.http.HttpResponse;
23  import org.apache.http.client.HttpClient;
24  import org.apache.http.client.config.RequestConfig;
25  import org.apache.http.client.methods.HttpGet;
26  import org.apache.http.client.protocol.HttpClientContext;
27  import org.apache.http.config.SocketConfig;
28  import org.apache.http.impl.client.HttpClients;
29  import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
30  import org.apache.http.message.BasicHeader;
31  
32  import java.io.ByteArrayInputStream;
33  import java.io.IOException;
34  import java.io.InputStream;
35  import java.net.URI;
36  import java.util.ArrayList;
37  import java.util.List;
38  
39  /**
40   * Opens an {@link InputStream} on an HTTP IRI. Is configured with sane values for timeouts, default headers and so on.
41   *
42   * @author Paolo Capriotti
43   * @author Richard Cyganiak (richard@cyganiak.de)
44   */
45  public class DefaultHTTPClient implements HTTPClient {
46  
47      private final PoolingHttpClientConnectionManager manager = new PoolingHttpClientConnectionManager();
48  
49      private HTTPClientConfiguration configuration;
50  
51      private HttpClient client = null;
52  
53      private long _contentLength = -1;
54  
55      private String actualDocumentIRI = null;
56  
57      private String contentType = null;
58  
59      /**
60       * Creates a {@link DefaultHTTPClient} instance already initialized
61       *
62       * @return populated {@link org.apache.any23.http.DefaultHTTPClient}
63       */
64      public static DefaultHTTPClient createInitializedHTTPClient() {
65          final DefaultHTTPClientltHTTPClient">DefaultHTTPClient defaultHTTPClient = new DefaultHTTPClient();
66          defaultHTTPClient.init(DefaultHTTPClientConfiguration.singleton());
67          return defaultHTTPClient;
68      }
69  
70      public void init(HTTPClientConfiguration configuration) {
71          if (configuration == null)
72              throw new NullPointerException("Illegal configuration, cannot be null.");
73          this.configuration = configuration;
74      }
75  
76      /**
77       *
78       * Opens an {@link java.io.InputStream} from a given IRI. It follows redirects.
79       *
80       * @param uri
81       *            to be opened
82       * 
83       * @return {@link java.io.InputStream}
84       * 
85       * @throws IOException
86       *             if there is an error opening the {@link java.io.InputStream} located at the URI.
87       */
88      public InputStream openInputStream(String uri) throws IOException {
89          HttpGet method = null;
90          try {
91              ensureClientInitialized();
92              HttpClientContext context = HttpClientContext.create();
93              method = new HttpGet(uri);
94              HttpResponse response = client.execute(method, context);
95              List<URI> locations = context.getRedirectLocations();
96  
97              URI actualURI = locations == null || locations.isEmpty() ? method.getURI()
98                      : locations.get(locations.size() - 1);
99              actualDocumentIRI = actualURI.toString();
100 
101             final Header contentTypeHeader = response.getFirstHeader("Content-Type");
102             contentType = contentTypeHeader == null ? null : contentTypeHeader.getValue();
103             if (response.getStatusLine().getStatusCode() != 200) {
104                 throw new IOException("Failed to fetch " + uri + ": " + response.getStatusLine().getStatusCode() + " "
105                         + response.getStatusLine().getReasonPhrase());
106             }
107 
108             byte[] bytes = IOUtils.toByteArray(response.getEntity().getContent());
109             _contentLength = bytes.length;
110             return new ByteArrayInputStream(bytes);
111         } finally {
112             if (method != null) {
113                 method.reset();
114             }
115         }
116     }
117 
118     /**
119      * Shuts down the connection manager.
120      */
121     public void close() {
122         manager.shutdown();
123     }
124 
125     public long getContentLength() {
126         return _contentLength;
127     }
128 
129     public String getActualDocumentIRI() {
130         return actualDocumentIRI;
131     }
132 
133     public String getContentType() {
134         return contentType;
135     }
136 
137     protected int getConnectionTimeout() {
138         return configuration.getDefaultTimeout();
139     }
140 
141     protected int getSoTimeout() {
142         return configuration.getDefaultTimeout();
143     }
144 
145     private void ensureClientInitialized() {
146         if (configuration == null)
147             throw new IllegalStateException("client must be initialized first.");
148         if (client != null)
149             return;
150 
151         RequestConfig requestConfig = RequestConfig.custom().setConnectTimeout(getConnectionTimeout())
152                 .setSocketTimeout(getSoTimeout()).setRedirectsEnabled(true).build();
153 
154         SocketConfig socketConfig = SocketConfig.custom().setSoTimeout(getSoTimeout()).build();
155 
156         List<Header> headers = new ArrayList<>();
157         headers.add(new BasicHeader("User-Agent", configuration.getUserAgent()));
158         if (configuration.getAcceptHeader() != null) {
159             headers.add(new BasicHeader("Accept", configuration.getAcceptHeader()));
160         }
161         headers.add(new BasicHeader("Accept-Language", "en-us,en-gb,en,*;q=0.3")); // TODO: this must become parametric.
162         // headers.add(new BasicHeader("Accept-Encoding", "x-gzip, gzip"));
163         headers.add(new BasicHeader("Accept-Charset", "utf-8,iso-8859-1;q=0.7,*;q=0.5"));
164 
165         client = HttpClients.custom().setConnectionManager(manager).setDefaultRequestConfig(requestConfig)
166                 .setDefaultSocketConfig(socketConfig).setMaxConnTotal(configuration.getMaxConnections())
167                 .setDefaultHeaders(headers).build();
168     }
169 
170 }