/**
 * Resolves a charset by name through {@code CharsetUtils.forName}.
 * If that lookup throws any exception, {@code ASCII} is returned instead,
 * so this method never propagates a lookup failure.
 *
 * @param name charset name to resolve
 * @return the resolved charset, or {@code ASCII} when the lookup fails
 */
private static Charset getCharset(String name) {
    Charset resolved;
    try {
        resolved = CharsetUtils.forName(name);
    } catch (Exception e) {
        // Deliberate best-effort: any lookup failure falls back to ASCII.
        resolved = ASCII;
    }
    return resolved;
}
/**
 * Attempts to parse the supplied date string by delegating to
 * {@code DATE_UTILS.tryToParse}. The method is declared
 * {@code synchronized} so that the thread-unsafe date formats are never
 * used by two threads at once.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-495">TIKA-495</a>
 * @param date date string
 * @return parsed date, or <code>null</code> if the date can't be parsed
 */
private static synchronized Date parseDate(String date) {
    final Date parsed = DATE_UTILS.tryToParse(date);
    return parsed;
}
/**
 * Creates a configuration from the XML file at the given path.
 * The file is parsed into a DOM with {@code XMLReaderUtils.buildDOM(path)}
 * and the resulting document is handed to another constructor of this class.
 *
 * @param path location of the configuration XML to parse
 * @throws TikaException declared here; presumably raised by DOM building or
 *         by the delegate constructor (confirm against those)
 * @throws IOException declared here; presumably raised when the file cannot
 *         be read (confirm)
 * @throws SAXException declared here; presumably raised when the XML cannot
 *         be parsed (confirm)
 */
public TikaConfig(Path path) throws TikaException, IOException, SAXException { this(XMLReaderUtils.buildDOM(path)); } public TikaConfig(Path path, ServiceLoader loader)
/**
 * Stores information about a {@link Parser} failure in the
 * {@link Metadata}: the failure's stack trace is added under
 * {@code EMBEDDED_EXCEPTION} and the parser's class name under
 * {@code EMBEDDED_PARSER}. This lets callers inspect what went wrong even
 * when the {@link Exception} was not thrown immediately (eg when several
 * different Parsers are used).
 *
 * @param parser   the parser that failed
 * @param failure  the problem that occurred
 * @param metadata the metadata to record the details into
 */
public static void recordParserFailure(Parser parser, Throwable failure, Metadata metadata) {
    metadata.add(EMBEDDED_EXCEPTION, ExceptionUtils.getStackTrace(failure));
    String parserName = getParserClassname(parser);
    metadata.add(EMBEDDED_PARSER, parserName);
}
/**
 * Loads the named class and instantiates it, using the class loader that
 * loaded {@code ServiceLoader}. All of the work is delegated to the
 * two-argument {@code newInstance} overload.
 *
 * @param className service class name
 * @param <T> service type
 * @return instance of service
 */
public static <T> T newInstance(String className) {
    ClassLoader defaultLoader = ServiceLoader.class.getClassLoader();
    return newInstance(className, defaultLoader);
}
/**
 * Parses the resource identified by the given URI string into a DOM
 * document, using a DocumentBuilder borrowed from the pool. The borrowed
 * builder is always released again, whether or not parsing succeeds.
 *
 * @since Apache Tika 1.19.1
 * @param uriString uriString to process
 * @return a document
 * @throws TikaException declared here; presumably raised when no builder
 *         can be acquired (confirm against acquireDOMBuilder)
 * @throws IOException declared here; presumably raised on read failure
 * @throws SAXException declared here; presumably raised on parse failure
 */
public static Document buildDOM(String uriString)
        throws TikaException, IOException, SAXException {
    PoolDOMBuilder pooled = acquireDOMBuilder();
    try {
        Document dom = pooled.getDocumentBuilder().parse(uriString);
        return dom;
    } finally {
        // Hand the builder back to the pool on every exit path.
        releaseDOMBuilder(pooled);
    }
}
/**
 * Adds the filtered stack trace of the given throwable to the metadata
 * under {@code TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM}.
 *
 * @param t the throwable whose stack trace is recorded
 * @param m the metadata to add the stack trace to
 */
public static void recordEmbeddedStreamException(Throwable t, Metadata m) {
    m.add(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM,
            ExceptionUtils.getFilteredStackTrace(t));
}
/**
 * Reads a byte from the stream, saving it in the store if it is being
 * read from the original stream (that is, while {@code firstPass} is set).
 * Implements the abstract InputStream.read().
 *
 * <p>The end-of-stream marker (-1) is returned to the caller but is never
 * handed to {@code saveByte}: it is not data, and recording it would put a
 * spurious byte into the store.
 *
 * @return the read byte, or -1 on end of stream.
 * @throws IOException if reading from the underlying stream fails; it is
 *         also declared for {@code saveByte} (confirm what it can raise)
 */
public int read() throws IOException {
    int inputByte = inputStream.read();
    // Only real data is saved; -1 signals end of stream, not a byte value.
    if (firstPass && inputByte != -1) {
        saveByte(inputByte);
    }
    return inputByte;
}
/** * Returns a ISO 8601 representation of the given date. This method * is thread safe and non-blocking. * * @see <a href="https://issues.apache.org/jira/browse/TIKA-495">TIKA-495</a> * @param date given date * @return ISO 8601 date string, including timezone details */ public static String formatDate(Calendar date) { // Explicitly switch it into UTC before formatting date.setTimeZone(UTC); return doFormatDate(date); } /**
/**
 * Checks {@code OS_NAME} against the given prefix by delegating to
 * {@code isOSNameMatch}.
 *
 * @param osNamePrefix the OS name prefix to test for
 * @return the result of {@code isOSNameMatch(OS_NAME, osNamePrefix)}
 */
private static boolean getOSMatchesName(String osNamePrefix) {
    final boolean matches = isOSNameMatch(OS_NAME, osNamePrefix);
    return matches;
}
/**
 * Adds the class name of the {@link Parser} that was used to the
 * {@link Metadata} under {@code X_PARSED_BY}. Typically wanted where
 * multiple parsers could be picked between or used.
 *
 * @param parser   the parser that was used
 * @param metadata the metadata to record the parser name into
 */
public static void recordParserDetails(Parser parser, Metadata metadata) {
    String parserName = getParserClassname(parser);
    metadata.add(X_PARSED_BY, parserName);
}
/**
 * Handles various common charset name errors by resolving the name with
 * {@code forName} and returning the resolved charset's own name, which
 * will be considered valid (and is normalized).
 *
 * @param charsetName name of charset to process
 * @return potentially remapped/cleaned up version of charset name, or
 *         {@code null} if resolving the name throws any exception
 */
public static String clean(String charsetName) {
    String canonical;
    try {
        canonical = forName(charsetName).name();
    } catch (Exception e) {
        // Deliberate best-effort: an unresolvable name yields null.
        canonical = null;
    }
    return canonical;
}
/**
 * Sets the recogniser from a class name: the named class is instantiated
 * via {@code ServiceLoaderUtils.newInstance(String)} and the instance is
 * stored in the {@code recogniser} field.
 *
 * NOTE(review): the {@code @Field(name = "class")} annotation presumably
 * binds this setter to a configuration parameter called "class" — confirm
 * against the annotation's definition.
 *
 * @param recogniserClass name of the recogniser class to instantiate
 */
@Field(name = "class") public void setRecogniser(String recogniserClass) { this.recogniser = ServiceLoaderUtils.newInstance(recogniserClass); }
/**
 * Creates a configuration from the XML file at the given path, using the
 * supplied service loader. The file is parsed into a DOM with
 * {@code XMLReaderUtils.buildDOM(path)}; the document and the loader are
 * then handed to another constructor of this class.
 *
 * @param path   location of the configuration XML to parse
 * @param loader service loader passed through to the delegate constructor
 * @throws TikaException declared here; presumably raised by DOM building or
 *         by the delegate constructor (confirm against those)
 * @throws IOException declared here; presumably raised when the file cannot
 *         be read (confirm)
 * @throws SAXException declared here; presumably raised when the XML cannot
 *         be parsed (confirm)
 */
public TikaConfig(Path path, ServiceLoader loader) throws TikaException, IOException, SAXException { this(XMLReaderUtils.buildDOM(path), loader); }
/**
 * Parses the content of the given stream into a DOM document, using a
 * DocumentBuilder borrowed from the pool. The borrowed builder is always
 * released again, whether or not parsing succeeds. This method does not
 * itself close the stream.
 *
 * @since Apache Tika 1.19.1
 * @param is stream to parse
 * @return a document
 * @throws TikaException declared here; presumably raised when no builder
 *         can be acquired (confirm against acquireDOMBuilder)
 * @throws IOException declared here; presumably raised on read failure
 * @throws SAXException declared here; presumably raised on parse failure
 */
public static Document buildDOM(InputStream is)
        throws TikaException, IOException, SAXException {
    PoolDOMBuilder pooled = acquireDOMBuilder();
    try {
        Document dom = pooled.getDocumentBuilder().parse(is);
        return dom;
    } finally {
        // Hand the builder back to the pool on every exit path.
        releaseDOMBuilder(pooled);
    }
}
/**
 * Adds the filtered stack trace of the given throwable to the metadata
 * under {@code TikaCoreProperties.TIKA_META_EXCEPTION_WARNING}.
 *
 * @param t the throwable whose stack trace is recorded
 * @param m the metadata to add the stack trace to
 */
public static void recordException(Throwable t, Metadata m) {
    m.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
            ExceptionUtils.getFilteredStackTrace(t));
}
/**
 * Creates a configuration from XML read from the given stream.
 * The stream is parsed into a DOM with
 * {@code XMLReaderUtils.buildDOM(stream)} and the resulting document is
 * handed to another constructor of this class. This constructor does not
 * itself close the stream.
 *
 * @param stream stream supplying the configuration XML
 * @throws TikaException declared here; presumably raised by DOM building or
 *         by the delegate constructor (confirm against those)
 * @throws IOException declared here; presumably raised when the stream
 *         cannot be read (confirm)
 * @throws SAXException declared here; presumably raised when the XML cannot
 *         be parsed (confirm)
 */
public TikaConfig(InputStream stream) throws TikaException, IOException, SAXException { this(XMLReaderUtils.buildDOM(stream)); }
/**
 * Creates a configuration from the given XML file.
 * The file is converted with {@code file.toPath()}, parsed into a DOM with
 * {@code XMLReaderUtils.buildDOM}, and the resulting document is handed to
 * another constructor of this class.
 *
 * @param file configuration XML file to parse
 * @throws TikaException declared here; presumably raised by DOM building or
 *         by the delegate constructor (confirm against those)
 * @throws IOException declared here; presumably raised when the file cannot
 *         be read (confirm)
 * @throws SAXException declared here; presumably raised when the XML cannot
 *         be parsed (confirm)
 */
public TikaConfig(File file) throws TikaException, IOException, SAXException { this(XMLReaderUtils.buildDOM(file.toPath())); }
/**
 * Creates a configuration from the given XML file, using the supplied
 * service loader. The file is converted with {@code file.toPath()} and
 * parsed into a DOM with {@code XMLReaderUtils.buildDOM}; the document and
 * the loader are then handed to another constructor of this class.
 *
 * @param file   configuration XML file to parse
 * @param loader service loader passed through to the delegate constructor
 * @throws TikaException declared here; presumably raised by DOM building or
 *         by the delegate constructor (confirm against those)
 * @throws IOException declared here; presumably raised when the file cannot
 *         be read (confirm)
 * @throws SAXException declared here; presumably raised when the XML cannot
 *         be parsed (confirm)
 */
public TikaConfig(File file, ServiceLoader loader) throws TikaException, IOException, SAXException { this(XMLReaderUtils.buildDOM(file.toPath()), loader); }
/**
 * Opens the file at the given path and parses its content into a DOM
 * document by delegating to the stream-based {@code buildDOM} overload.
 * The stream opened here is closed before this method returns.
 *
 * @since Apache Tika 1.19.1
 * @param path path to parse
 * @return a document
 * @throws TikaException declared here; presumably propagated from the
 *         stream-based overload (confirm)
 * @throws IOException if the file cannot be opened; presumably also on
 *         read failure during parsing
 * @throws SAXException declared here; presumably raised on parse failure
 */
public static Document buildDOM(Path path)
        throws TikaException, IOException, SAXException {
    try (InputStream stream = Files.newInputStream(path)) {
        Document dom = buildDOM(stream);
        return dom;
    }
}