Index: tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java (revision 1704732) +++ tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java (revision ) @@ -21,15 +21,12 @@ import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; -import java.io.OutputStream; import java.net.URL; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; import org.apache.tika.metadata.Metadata; import org.junit.Test; @@ -38,13 +35,13 @@ @Test public void testFileBased() throws IOException { - File file = createTempFile("Hello, World!"); - InputStream stream = TikaInputStream.get(file); + Path path = createTempFile("Hello, World!"); + InputStream stream = TikaInputStream.get(path); assertEquals( "The file returned by the getFile() method should" + " be the file used to instantiate a TikaInputStream", - file, TikaInputStream.get(stream).getFile()); + path, TikaInputStream.get(stream).getPath()); assertEquals( "The contents of the TikaInputStream should equal the" @@ -54,20 +51,19 @@ stream.close(); assertTrue( "The close() method must not remove the file used to" - + " instantiate a TikaInputStream", + + " instantiate a TikaInputStream", - file.exists()); + Files.exists(path)); - file.delete(); + Files.delete(path); } @Test public void testStreamBased() throws IOException { - InputStream input = - new ByteArrayInputStream("Hello, World!".getBytes(UTF_8)); + InputStream input = IOUtils.toInputStream("Hello, World!", UTF_8.name()); InputStream stream = 
TikaInputStream.get(input); - File file = TikaInputStream.get(stream).getFile(); - assertTrue(file != null && file.isFile()); + Path file = TikaInputStream.get(stream).getPath(); + assertTrue(file != null && Files.isRegularFile(file)); assertEquals( "The contents of the file returned by the getFile method" @@ -83,27 +79,21 @@ assertFalse( "The close() method must remove the temporary file created" + " by a TikaInputStream", - file.exists()); + Files.exists(file)); } - private File createTempFile(String data) throws IOException { - File file = File.createTempFile("tika-", ".tmp"); - try (OutputStream stream = new FileOutputStream(file)) { - stream.write(data.getBytes(UTF_8)); - } + private Path createTempFile(String data) throws IOException { + Path file = Files.createTempFile("tika-", ".tmp"); + Files.write(file, data.getBytes(UTF_8)); return file; } - private String readFile(File file) throws IOException { - try (InputStream stream = new FileInputStream(file)) { - return readStream(stream); + private String readFile(Path file) throws IOException { + return new String(Files.readAllBytes(file), UTF_8); - } + } - } private String readStream(InputStream stream) throws IOException { - ByteArrayOutputStream buffer = new ByteArrayOutputStream(); - IOUtils.copy(stream, buffer); - return buffer.toString(UTF_8.name()); + return IOUtils.toString(stream, UTF_8.name()); } @Test @@ -113,7 +103,7 @@ TikaInputStream.get(url, metadata).close(); assertEquals("test.txt", metadata.get(Metadata.RESOURCE_NAME_KEY)); assertEquals( - Long.toString(new File(url.toURI()).length()), + Long.toString(Files.size(Paths.get(url.toURI()))), metadata.get(Metadata.CONTENT_LENGTH)); } Index: tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java (revision 1704732) +++ 
tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java (revision ) @@ -22,20 +22,23 @@ import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; -import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; -import java.io.OutputStream; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.net.URLConnection; import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; import java.sql.Blob; import java.sql.SQLException; import org.apache.tika.metadata.Metadata; +import static java.nio.file.StandardCopyOption.REPLACE_EXISTING; + /** * Input stream with extended capabilities. The purpose of this class is * to allow files and other resources and information to be associated with @@ -193,6 +196,39 @@ } /** + * Creates a TikaInputStream from the file at the given path. + * <p>
+ * Note that you must always explicitly close the returned stream to + * prevent leaking open file handles. + * + * @param path input file + * @return a TikaInputStream instance + * @throws IOException if an I/O error occurs + */ + public static TikaInputStream get(Path path) throws IOException { + return get(path, new Metadata()); + } + + /** + * Creates a TikaInputStream from the file at the given path. The file name + * and length are stored as input metadata in the given metadata instance. + * <p>
+ * Note that you must always explicitly close the returned stream to + * prevent leaking open file handles. + * + * @param path input file + * @param metadata metadata instance + * @return a TikaInputStream instance + * @throws IOException if an I/O error occurs + */ + public static TikaInputStream get(Path path, Metadata metadata) + throws IOException { + metadata.set(Metadata.RESOURCE_NAME_KEY, path.getFileName().toString()); + metadata.set(Metadata.CONTENT_LENGTH, Long.toString(Files.size(path))); + return new TikaInputStream(path); + } + + /** + * Creates a TikaInputStream from the given file. * <p>
* Note that you must always explicitly close the returned stream to
@@ -201,6 +237,7 @@
* @param file input file
* @return a TikaInputStream instance
* @throws FileNotFoundException if the file does not exist
+ * @see #get(Path)
*/
public static TikaInputStream get(File file) throws FileNotFoundException {
return get(file, new Metadata());
@@ -217,6 +254,8 @@
* @param metadata metadata instance
* @return a TikaInputStream instance
* @throws FileNotFoundException if the file does not exist
+ * or cannot be opened for reading
+ * @see #get(Path, Metadata)
*/
public static TikaInputStream get(File file, Metadata metadata)
throws FileNotFoundException {
@@ -314,9 +353,9 @@
throws IOException {
// Special handling for file:// URIs
if ("file".equalsIgnoreCase(uri.getScheme())) {
- File file = new File(uri);
- if (file.isFile()) {
- return get(file, metadata);
+ Path path = Paths.get(uri);
+ if (Files.isRegularFile(path)) {
+ return get(path, metadata);
}
}
@@ -354,9 +393,9 @@
// Special handling for file:// URLs
if ("file".equalsIgnoreCase(url.getProtocol())) {
try {
- File file = new File(url.toURI());
- if (file.isFile()) {
- return get(file, metadata);
+ Path path = Paths.get(url.toURI());
+ if (Files.isRegularFile(path)) {
+ return get(path, metadata);
}
} catch (URISyntaxException e) {
// fall through
@@ -392,13 +431,13 @@
}
/**
- * The file that contains the contents of this stream. This is either
- * the original file passed to the {@link #TikaInputStream(File)}
- * constructor or a temporary file created by a call to the
- * {@link #getFile()} method. If neither has been called, then
- * the value is <code>null</code>.
+ * The path to the file that contains the contents of this stream.
+ * This is either the original file passed to the
+ * {@link #TikaInputStream(Path)} constructor or a temporary file created
+ * by a call to the {@link #getPath()} method. If neither has been called,
+ * then the value is <code>null</code>.
*/
- private File file;
+ private Path path;
/**
* Tracker of temporary resources.
@@ -431,12 +470,27 @@
* Creates a TikaInputStream instance. This private constructor is used
* by the static factory methods based on the available information.
*
+ * @param path the path to the file that contains the stream
+ * @throws IOException if an I/O error occurs
+ */
+ private TikaInputStream(Path path) throws IOException {
+ super(new BufferedInputStream(Files.newInputStream(path)));
+ this.path = path;
+ this.tmp = new TemporaryResources();
+ this.length = Files.size(path);
+ }
+
+ /**
+ * Creates a TikaInputStream instance. This private constructor is used
+ * by the static factory methods based on the available information.
+ *
* @param file the file that contains the stream
* @throws FileNotFoundException if the file does not exist
+ * @see #TikaInputStream(Path)
*/
private TikaInputStream(File file) throws FileNotFoundException {
super(new BufferedInputStream(new FileInputStream(file)));
- this.file = file;
+ this.path = file.toPath();
this.tmp = new TemporaryResources();
this.length = file.length();
}
@@ -456,7 +510,7 @@
private TikaInputStream(
InputStream stream, TemporaryResources tmp, long length) {
super(stream);
- this.file = null;
+ this.path = null;
this.tmp = tmp;
this.length = length;
}
@@ -515,22 +569,20 @@
}
public boolean hasFile() {
- return file != null;
+ return path != null;
}
- public File getFile() throws IOException {
- if (file == null) {
+ public Path getPath() throws IOException {
+ if (path == null) {
if (position > 0) {
throw new IOException("Stream is already being read");
} else {
// Spool the entire stream into a temporary file
- file = tmp.createTemporaryFile();
- try (OutputStream out = new FileOutputStream(file)) {
- IOUtils.copy(in, out);
- }
+ path = tmp.createTempFile();
+ Files.copy(in, path, REPLACE_EXISTING);
// Create a new input stream and make sure it'll get closed
- FileInputStream newStream = new FileInputStream(file);
+ InputStream newStream = Files.newInputStream(path);
tmp.addResource(newStream);
// Replace the spooled stream with the new stream in a way
@@ -545,16 +597,21 @@
}
};
- length = file.length();
+ length = Files.size(path);
}
}
- return file;
+ return path;
}
+ /**
+ * @see #getPath()
+ */
+ public File getFile() throws IOException {
+ return getPath().toFile();
+ }
+
public FileChannel getFileChannel() throws IOException {
- FileInputStream fis = new FileInputStream(getFile());
- tmp.addResource(fis);
- FileChannel channel = fis.getChannel();
+ FileChannel channel = FileChannel.open(getPath());
tmp.addResource(channel);
return channel;
}
@@ -566,7 +623,7 @@
/**
* Returns the length (in bytes) of this stream. Note that if the length
* was not available when this stream was instantiated, then this method
- * will use the {@link #getFile()} method to buffer the entire stream to
+ * will use the {@link #getPath()} method to buffer the entire stream to
* a temporary file in order to calculate the stream length. This case
* will only work if the stream has not yet been consumed.
*
@@ -575,7 +632,7 @@
*/
public long getLength() throws IOException {
if (length == -1) {
- length = getFile().length();
+ getPath(); // updates length internally
}
return length;
}
@@ -616,7 +673,7 @@
@Override
public void close() throws IOException {
- file = null;
+ path = null;
mark = -1;
// The close method was explicitly called, so we indeed
@@ -638,7 +695,7 @@
public String toString() {
String str = "TikaInputStream of ";
if (hasFile()) {
- str += file.toString();
+ str += path.toString();
} else {
str += in.toString();
}