View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.http;
19  
20  import org.apache.commons.io.IOUtils;
21  import org.apache.http.Header;
22  import org.apache.http.HttpResponse;
23  import org.apache.http.client.HttpClient;
24  import org.apache.http.client.config.RequestConfig;
25  import org.apache.http.client.methods.HttpGet;
26  import org.apache.http.client.protocol.HttpClientContext;
27  import org.apache.http.config.SocketConfig;
28  import org.apache.http.impl.client.HttpClients;
29  import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
30  import org.apache.http.message.BasicHeader;
31  
32  import java.io.ByteArrayInputStream;
33  import java.io.IOException;
34  import java.io.InputStream;
35  import java.net.URI;
36  import java.util.ArrayList;
37  import java.util.List;
38  
39  /**
40   * Opens an {@link InputStream} on an HTTP IRI. Is configured
41   * with sane values for timeouts, default headers and so on.
42   *
43   * @author Paolo Capriotti
44   * @author Richard Cyganiak (richard@cyganiak.de)
45   */
46  public class DefaultHTTPClient implements HTTPClient {
47  
48      private final PoolingHttpClientConnectionManager manager = new PoolingHttpClientConnectionManager();
49  
50      private HTTPClientConfiguration configuration;
51  
52      private HttpClient client = null;
53  
54      private long _contentLength = -1;
55  
56      private String actualDocumentIRI = null;
57  
58      private String contentType = null;
59  
60  
61      /**
62       * Creates a {@link DefaultHTTPClient} instance already initialized
63       *
64       * @return populated {@link org.apache.any23.http.DefaultHTTPClient}
65       */
66      public static DefaultHTTPClient createInitializedHTTPClient() {
67          final DefaultHTTPClientltHTTPClient">DefaultHTTPClient defaultHTTPClient = new DefaultHTTPClient();
68          defaultHTTPClient.init( DefaultHTTPClientConfiguration.singleton() );
69          return defaultHTTPClient;
70      }
71  
72      public void init(HTTPClientConfiguration configuration) {
73          if(configuration == null) throw new NullPointerException("Illegal configuration, cannot be null.");
74          this.configuration = configuration;
75      }
76  
77      /**
78       *
79       * Opens an {@link java.io.InputStream} from a given IRI.
80       * It follows redirects.
81       *
82       * @param uri to be opened
83       * @return {@link java.io.InputStream}
84       * @throws IOException if there is an error opening the {@link java.io.InputStream}
85       * located at the URI.
86       */
87      public InputStream openInputStream(String uri) throws IOException {
88          HttpGet method = null;
89          try {
90              ensureClientInitialized();
91              HttpClientContext context = HttpClientContext.create();
92              method = new HttpGet(uri);
93              HttpResponse response = client.execute(method, context);
94              List<URI> locations = context.getRedirectLocations();
95  
96              URI actualURI = locations == null || locations.isEmpty() ? method.getURI() : locations.get(locations.size() - 1);
97              actualDocumentIRI = actualURI.toString();
98  
99              final Header contentTypeHeader = response.getFirstHeader("Content-Type");
100             contentType = contentTypeHeader == null ? null : contentTypeHeader.getValue();
101             if (response.getStatusLine().getStatusCode() != 200) {
102                 throw new IOException(
103                         "Failed to fetch " + uri + ": " + response.getStatusLine().getStatusCode() + " " + response.getStatusLine().getReasonPhrase()
104                 );
105             }
106 
107             byte[] bytes = IOUtils.toByteArray(response.getEntity().getContent());
108             _contentLength = bytes.length;
109             return new ByteArrayInputStream(bytes);
110         } finally {
111             if (method != null) {
112                 method.reset();
113             }
114         }
115     }
116 
117     /**
118      * Shuts down the connection manager.
119      */
120     public void close() {
121         manager.shutdown();
122     }
123 
124     public long getContentLength() {
125         return _contentLength;
126     }
127 
128     public String getActualDocumentIRI() {
129         return actualDocumentIRI;
130     }
131 
132     public String getContentType() {
133         return contentType;
134     }
135 
136     protected int getConnectionTimeout() {
137         return configuration.getDefaultTimeout();
138     }
139 
140     protected int getSoTimeout() {
141         return configuration.getDefaultTimeout();
142     }
143 
144     private void ensureClientInitialized() {
145         if (configuration == null)
146             throw new IllegalStateException("client must be initialized first.");
147         if (client != null)
148             return;
149 
150         RequestConfig requestConfig = RequestConfig.custom()
151                 .setConnectTimeout(getConnectionTimeout())
152                 .setSocketTimeout(getSoTimeout())
153                 .setRedirectsEnabled(true)
154                 .build();
155 
156         SocketConfig socketConfig = SocketConfig.custom()
157                 .setSoTimeout(getSoTimeout())
158                 .build();
159 
160         List<Header> headers = new ArrayList<>();
161         headers.add(new BasicHeader("User-Agent", configuration.getUserAgent()));
162         if (configuration.getAcceptHeader() != null) {
163             headers.add(new BasicHeader("Accept", configuration.getAcceptHeader()));
164         }
165         headers.add(new BasicHeader("Accept-Language", "en-us,en-gb,en,*;q=0.3")); //TODO: this must become parametric.
166         // headers.add(new BasicHeader("Accept-Encoding", "x-gzip, gzip"));
167         headers.add(new BasicHeader("Accept-Charset", "utf-8,iso-8859-1;q=0.7,*;q=0.5"));
168 
169 
170         client = HttpClients.custom()
171                 .setConnectionManager(manager)
172                 .setDefaultRequestConfig(requestConfig)
173                 .setDefaultSocketConfig(socketConfig)
174                 .setMaxConnTotal(configuration.getMaxConnections())
175                 .setDefaultHeaders(headers)
176                 .build();
177     }
178 
179 }