@Override public void computeEnhancements(ContentItem ci) throws EngineException { HtmlExtractor extractor = new HtmlExtractor(htmlExtractorRegistry, htmlParser); Graph model = new SimpleGraph(); ci.getLock().readLock().lock(); try { extractor.extract(ci.getUri().getUnicodeString(), ci.getStream(),null, ci.getMimeType(), model); } catch (ExtractorException e) { throw new EngineException("Error while processing ContentItem " + ci.getUri()+" with HtmlExtractor",e); } finally { ci.getLock().readLock().unlock(); } ClerezzaRDFUtils.urifyBlankNodes(model); // make the model single rooted if (singleRootRdf) { ClerezzaRDFUtils.makeConnected(model,ci.getUri(),new IRI(NIE_NS+"contains")); } //add the extracted triples to the metadata of the ContentItem ci.getLock().writeLock().lock(); try { LOG.info("Model: {}",model); ci.getMetadata().addAll(model); model = null; } finally { ci.getLock().writeLock().unlock(); } }
@Override public void computeEnhancements(ContentItem ci) throws EngineException { HtmlExtractor extractor = new HtmlExtractor(htmlExtractorRegistry, htmlParser); Graph model = new SimpleGraph(); ci.getLock().readLock().lock(); try { extractor.extract(ci.getUri().getUnicodeString(), ci.getStream(),null, ci.getMimeType(), model); } catch (ExtractorException e) { throw new EngineException("Error while processing ContentItem " + ci.getUri()+" with HtmlExtractor",e); } finally { ci.getLock().readLock().unlock(); } ClerezzaRDFUtils.urifyBlankNodes(model); // make the model single rooted if (singleRootRdf) { ClerezzaRDFUtils.makeConnected(model,ci.getUri(),new IRI(NIE_NS+"contains")); } //add the extracted triples to the metadata of the ContentItem ci.getLock().writeLock().lock(); try { LOG.info("Model: {}",model); ci.getMetadata().addAll(model); model = null; } finally { ci.getLock().writeLock().unlock(); } }
/**
 * Getter for the content type of the parsed ContentItem. If the type
 * returned by {@code getMediaType(ci.getBlob())} is {@code null} or
 * {@link MediaType#OCTET_STREAM}, then the media type is detected from the
 * content stream.<p>
 * This method returns the MediaType together with the stream used to detect
 * it, which allows the caller to reuse both. The stream is only set when
 * detection was attempted and succeeded. If detection fails with an
 * {@link IOException}, a warning is logged, the stream is closed and reset
 * to {@code null}, and the media type keeps its pre-detection value
 * ({@code null} or {@link MediaType#OCTET_STREAM}).
 *
 * @param ci the ContentItem to determine the media type for
 * @return holder with the URI of the ContentItem, the determined media type
 *         and (if detection was run successfully) the stream used for the
 *         detection; NOTE(review): the caller presumably has to close that
 *         stream - confirm against the call sites
 */
private MediaTypeAndStream extractMediaType(ContentItem ci) {
    MediaTypeAndStream mtas = new MediaTypeAndStream();
    mtas.mediaType = getMediaType(ci.getBlob());
    mtas.uri = ci.getUri().getUnicodeString();
    if (mtas.mediaType == null || mtas.mediaType.equals(MediaType.OCTET_STREAM)) {
        // buffered so that the very same stream can be handed back to the caller
        mtas.in = new BufferedInputStream(ci.getStream());
        Metadata m = new Metadata();
        m.add(Metadata.RESOURCE_NAME_KEY, mtas.uri);
        try {
            mtas.mediaType = detector.detect(mtas.in, m);
        } catch (IOException e) {
            // the original message was missing the blank between "the" and "parsed"
            log.warn("Exception while detection the MediaType of the "
                    + "parsed ContentItem " + ci.getUri(), e);
            // the stream is in an undefined state: release it and signal the
            // caller (via null) that a fresh stream must be requested
            IOUtils.closeQuietly(mtas.in);
            mtas.in = null;
        }
    }
    return mtas;
}
// Request an InputStream from the ContentItem via getStream().
// NOTE(review): nothing visible here closes binaryContent - confirm that the
// code consuming the stream releases it.
InputStream binaryContent = ci.getStream();
final InputStream in; if(mtas.in == null){ in = ci.getStream(); } else { in = mtas.in;