Provides text extraction, metadata extraction, MIME-type detection, text-encoding detection and language detection, all via a native Java bridge to the Apache Tika content-analysis toolkit. Bundles Tika 1.5.
Depends on node-java, which itself requires the JDK and Python 2 (not 3) to compile.
Requires JDK 7. Run `node version` to check the Java version that node-java is using. If the wrong version is reported even though you installed JDK 7 (1.7), make sure `JAVA_HOME` is set to the correct path, then delete `node_modules/java` and rerun `npm install`.
var tika = require('tika');

Extract both text and metadata from a file. Content type is optional but would help Tika in some cases.
tika.extract('test/data/file.pdf', function(err, text, meta) {
assert.equal(text.trim(), 'Just some text.');
assert.deepEqual(meta.producer, 'LibreOffice 4.1');
});

Extract text from a file.
tika.text('test/data/file.pdf', function(err, text) {
assert.equal(text.trim(), 'Just some text.');
});

Extract metadata from a file. Returns an object with names as keys.
tika.meta('test/data/file.pdf', function(err, meta) {
assert.deepEqual(meta.producer, 'LibreOffice 4.1');
});

Detect the content-type of a file.
tika.type('test/data/file.pdf', function(err, contentType) {
assert.equal(contentType, 'application/pdf');
});

The withCharset parameter defaults to false. If set to true, then the charset will be appended to the mime-type.
tika.type('test/data/file.txt', true, function(err, contentType) {
assert.equal(contentType, 'text/plain; charset=ISO-8859-1');
});

Detect the character set (text encoding) of a file.
tika.charset('test/data/file.txt', true, function(err, charset) {
assert.equal(charset, 'ISO-8859-1');
});

Detect the language a given string is written in.
tika.language('This is just some text in English.', function(err, language, reasonablyCertain) {
assert.equal(language, 'en');
});

Developed by Matthew Caruana Galizia. Please feel free to submit an issue or pull request.
Copyright (c) 2013 Matthew Caruana Galizia. Licensed under an MIT-style license.
Apache Tika JAR distributed under the Apache License, Version 2.0.
