/* |
= |
/* |
* Licensed to the Apache Software Foundation (ASF) under one or more |
|
* Licensed to the Apache Software Foundation (ASF) under one or more |
* contributor license agreements. See the NOTICE file distributed with |
|
* contributor license agreements. See the NOTICE file distributed with |
* this work for additional information regarding copyright ownership. |
|
* this work for additional information regarding copyright ownership. |
* The ASF licenses this file to You under the Apache License, Version 2.0 |
|
* The ASF licenses this file to You under the Apache License, Version 2.0 |
* (the "License"); you may not use this file except in compliance with |
|
* (the "License"); you may not use this file except in compliance with |
* the License. You may obtain a copy of the License at |
|
* the License. You may obtain a copy of the License at |
* |
|
* |
* http://www.apache.org/licenses/LICENSE-2.0 |
|
* http://www.apache.org/licenses/LICENSE-2.0 |
* |
|
* |
* Unless required by applicable law or agreed to in writing, software |
|
* Unless required by applicable law or agreed to in writing, software |
* distributed under the License is distributed on an "AS IS" BASIS, |
|
* distributed under the License is distributed on an "AS IS" BASIS, |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
* See the License for the specific language governing permissions and |
|
* See the License for the specific language governing permissions and |
* limitations under the License. |
|
* limitations under the License. |
*/ |
|
*/ |
package org.apache.tika.parser.microsoft; |
|
package org.apache.tika.parser.microsoft; |
|
|
|
import java.io.FileNotFoundException; |
+- |
|
import java.io.IOException; |
= |
import java.io.IOException; |
import java.util.HashSet; |
|
import java.util.HashSet; |
|
|
|
import org.apache.poi.hslf.HSLFSlideShow; |
|
import org.apache.poi.hslf.HSLFSlideShow; |
import org.apache.poi.hslf.model.*; |
|
import org.apache.poi.hslf.model.*; |
import org.apache.poi.hslf.usermodel.ObjectData; |
|
import org.apache.poi.hslf.usermodel.ObjectData; |
import org.apache.poi.hslf.usermodel.PictureData; |
|
import org.apache.poi.hslf.usermodel.PictureData; |
import org.apache.poi.hslf.usermodel.SlideShow; |
|
import org.apache.poi.hslf.usermodel.SlideShow; |
import org.apache.poi.poifs.filesystem.DirectoryNode; |
|
import org.apache.poi.poifs.filesystem.DirectoryNode; |
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; |
|
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; |
import org.apache.tika.exception.TikaException; |
|
import org.apache.tika.exception.TikaException; |
import org.apache.tika.io.CloseShieldInputStream; |
+- |
|
import org.apache.tika.io.TikaInputStream; |
= |
import org.apache.tika.io.TikaInputStream; |
import org.apache.tika.parser.ParseContext; |
|
import org.apache.tika.parser.ParseContext; |
import org.apache.tika.sax.XHTMLContentHandler; |
|
import org.apache.tika.sax.XHTMLContentHandler; |
import org.xml.sax.SAXException; |
|
import org.xml.sax.SAXException; |
import org.xml.sax.helpers.AttributesImpl; |
|
import org.xml.sax.helpers.AttributesImpl; |
|
|
|
public class HSLFExtractor extends AbstractPOIFSExtractor { |
|
public class HSLFExtractor extends AbstractPOIFSExtractor { |
public HSLFExtractor(ParseContext context) { |
|
public HSLFExtractor(ParseContext context) { |
super(context); |
|
super(context); |
} |
|
} |
|
<> |
|
protected void parse( |
= |
protected void parse( |
NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml) |
|
NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml) |
throws IOException, SAXException, TikaException { |
|
throws IOException, SAXException, TikaException { |
parse(filesystem.getRoot(), xhtml); |
|
parse(filesystem.getRoot(), xhtml); |
} |
|
} |
|
|
|
protected void parse( |
|
protected void parse( |
DirectoryNode root, XHTMLContentHandler xhtml) |
|
DirectoryNode root, XHTMLContentHandler xhtml) |
throws IOException, SAXException, TikaException { |
|
throws IOException, SAXException, TikaException { |
HSLFSlideShow ss = new HSLFSlideShow(root); |
|
HSLFSlideShow ss = new HSLFSlideShow(root); |
SlideShow _show = new SlideShow(ss); |
|
SlideShow _show = new SlideShow(ss); |
Slide[] _slides = _show.getSlides(); |
|
Slide[] _slides = _show.getSlides(); |
|
|
|
xhtml.startElement("div", "class", "slideShow"); |
|
xhtml.startElement("div", "class", "slideShow"); |
|
|
|
/* Iterate over slides and extract text */ |
|
/* Iterate over slides and extract text */ |
for( Slide slide : _slides ) { |
|
for( Slide slide : _slides ) { |
xhtml.startElement("div", "class", "slide"); |
|
xhtml.startElement("div", "class", "slide"); |
|
|
|
// Slide header, if present |
|
// Slide header, if present |
HeadersFooters hf = slide.getHeadersFooters(); |
|
HeadersFooters hf = slide.getHeadersFooters(); |
try |
|
try |
{ |
|
{ |
if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) { |
|
if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) { |
xhtml.startElement("p", "class", "slide-header"); |
|
xhtml.startElement("p", "class", "slide-header"); |
|
|
|
xhtml.characters( hf.getHeaderText() ); |
|
xhtml.characters( hf.getHeaderText() ); |
|
|
|
xhtml.endElement("p"); |
|
xhtml.endElement("p"); |
} |
|
} |
} catch (Exception e) |
|
} catch (Exception e) |
{ |
|
{ |
//do nothing |
|
//do nothing |
} |
|
} |
|
|
|
// Slide master, if present |
|
// Slide master, if present |
extractMaster(xhtml, slide.getMasterSheet()); |
|
extractMaster(xhtml, slide.getMasterSheet()); |
|
|
|
// Slide text |
|
// Slide text |
{ |
|
{ |
xhtml.startElement("p", "class", "slide-content"); |
|
xhtml.startElement("p", "class", "slide-content"); |
|
|
|
textRunsToText(xhtml, slide.getTextRuns()); |
|
textRunsToText(xhtml, slide.getTextRuns()); |
|
|
|
xhtml.endElement("p"); |
|
xhtml.endElement("p"); |
} |
|
} |
|
|
|
// Table text |
|
// Table text |
for (Shape shape: slide.getShapes()){ |
|
for (Shape shape: slide.getShapes()){ |
if (shape instanceof Table){ |
|
if (shape instanceof Table){ |
extractTableText(xhtml, (Table)shape); |
|
extractTableText(xhtml, (Table)shape); |
} |
|
} |
} |
|
} |
|
|
|
try |
|
try |
{ |
|
{ |
// Slide footer, if present |
|
// Slide footer, if present |
if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) { |
|
if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) { |
xhtml.startElement("p", "class", "slide-footer"); |
|
xhtml.startElement("p", "class", "slide-footer"); |
|
|
|
xhtml.characters( hf.getFooterText() ); |
|
xhtml.characters( hf.getFooterText() ); |
|
|
|
xhtml.endElement("p"); |
|
xhtml.endElement("p"); |
} |
|
} |
} catch(Exception e) |
|
} catch(Exception e) |
{ |
|
{ |
//do nothig |
|
//do nothig |
} |
|
} |
|
|
|
// Comments, if present |
|
// Comments, if present |
for( Comment comment : slide.getComments() ) { |
|
for( Comment comment : slide.getComments() ) { |
xhtml.startElement("p", "class", "slide-comment"); |
|
xhtml.startElement("p", "class", "slide-comment"); |
if (comment.getAuthor() != null) { |
|
if (comment.getAuthor() != null) { |
xhtml.startElement("b"); |
|
xhtml.startElement("b"); |
xhtml.characters( comment.getAuthor() ); |
|
xhtml.characters( comment.getAuthor() ); |
xhtml.endElement("b"); |
|
xhtml.endElement("b"); |
|
|
|
if (comment.getText() != null) { |
|
if (comment.getText() != null) { |
xhtml.characters( " - "); |
|
xhtml.characters( " - "); |
} |
|
} |
} |
|
} |
if (comment.getText() != null) { |
|
if (comment.getText() != null) { |
xhtml.characters( comment.getText() ); |
|
xhtml.characters( comment.getText() ); |
} |
|
} |
xhtml.endElement("p"); |
|
xhtml.endElement("p"); |
} |
|
} |
|
<> |
|
// Now any embedded resources |
= |
// Now any embedded resources |
handleSlideEmbeddedResources(slide, xhtml); |
|
handleSlideEmbeddedResources(slide, xhtml); |
|
|
|
// TODO Find the Notes for this slide and extract inline |
|
// TODO Find the Notes for this slide and extract inline |
|
|
|
// Slide complete |
|
// Slide complete |
xhtml.endElement("div"); |
|
xhtml.endElement("div"); |
} |
|
} |
|
|
|
// All slides done |
|
// All slides done |
xhtml.endElement("div"); |
|
xhtml.endElement("div"); |
|
|
|
/* notes */ |
|
/* notes */ |
xhtml.startElement("div", "class", "slideNotes"); |
|
xhtml.startElement("div", "class", "slideNotes"); |
HashSet<Integer> seenNotes = new HashSet<Integer>(); |
|
HashSet<Integer> seenNotes = new HashSet<Integer>(); |
HeadersFooters hf = _show.getNotesHeadersFooters(); |
|
HeadersFooters hf = _show.getNotesHeadersFooters(); |
|
|
|
for (Slide slide : _slides) { |
|
for (Slide slide : _slides) { |
Notes notes = slide.getNotesSheet(); |
|
Notes notes = slide.getNotesSheet(); |
if (notes == null) { |
|
if (notes == null) { |
continue; |
|
continue; |
} |
|
} |
Integer id = Integer.valueOf(notes._getSheetNumber()); |
|
Integer id = Integer.valueOf(notes._getSheetNumber()); |
if (seenNotes.contains(id)) { |
|
if (seenNotes.contains(id)) { |
continue; |
|
continue; |
} |
|
} |
seenNotes.add(id); |
|
seenNotes.add(id); |
|
|
|
// Repeat the Notes header, if set |
|
// Repeat the Notes header, if set |
if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) { |
|
if (hf != null && hf.isHeaderVisible() && hf.getHeaderText() != null) { |
xhtml.startElement("p", "class", "slide-note-header"); |
|
xhtml.startElement("p", "class", "slide-note-header"); |
xhtml.characters( hf.getHeaderText() ); |
|
xhtml.characters( hf.getHeaderText() ); |
xhtml.endElement("p"); |
|
xhtml.endElement("p"); |
} |
|
} |
|
|
|
// Notes text |
|
// Notes text |
textRunsToText(xhtml, notes.getTextRuns()); |
|
textRunsToText(xhtml, notes.getTextRuns()); |
|
|
|
// Repeat the notes footer, if set |
|
// Repeat the notes footer, if set |
if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) { |
|
if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) { |
xhtml.startElement("p", "class", "slide-note-footer"); |
|
xhtml.startElement("p", "class", "slide-note-footer"); |
xhtml.characters( hf.getFooterText() ); |
|
xhtml.characters( hf.getFooterText() ); |
xhtml.endElement("p"); |
|
xhtml.endElement("p"); |
} |
|
} |
} |
|
} |
|
|
|
handleSlideEmbeddedPictures(_show, xhtml); |
|
handleSlideEmbeddedPictures(_show, xhtml); |
|
|
|
xhtml.endElement("div"); |
|
xhtml.endElement("div"); |
} |
|
} |
|
|
|
private void extractMaster(XHTMLContentHandler xhtml, MasterSheet master) throws SAXException { |
|
private void extractMaster(XHTMLContentHandler xhtml, MasterSheet master) throws SAXException { |
if (master == null){ |
|
if (master == null){ |
return; |
|
return; |
} |
|
} |
Shape[] shapes = master.getShapes(); |
|
Shape[] shapes = master.getShapes(); |
if (shapes == null || shapes.length == 0){ |
|
if (shapes == null || shapes.length == 0){ |
return; |
|
return; |
} |
|
} |
|
|
|
xhtml.startElement("div", "class", "slide-master-content"); |
|
xhtml.startElement("div", "class", "slide-master-content"); |
for (int i = 0; i < shapes.length; i++){ |
|
for (int i = 0; i < shapes.length; i++){ |
Shape sh = shapes[i]; |
|
Shape sh = shapes[i]; |
if (sh != null && ! MasterSheet.isPlaceholder(sh)){ |
|
if (sh != null && ! MasterSheet.isPlaceholder(sh)){ |
if (sh instanceof TextShape){ |
|
if (sh instanceof TextShape){ |
TextShape tsh = (TextShape)sh; |
|
TextShape tsh = (TextShape)sh; |
String text = tsh.getText(); |
|
String text = tsh.getText(); |
if (text != null){ |
|
if (text != null){ |
xhtml.element("p", text); |
|
xhtml.element("p", text); |
} |
|
} |
} |
|
} |
} |
|
} |
} |
|
} |
xhtml.endElement("div"); |
|
xhtml.endElement("div"); |
} |
|
} |
|
|
|
private void extractTableText(XHTMLContentHandler xhtml, Table shape) throws SAXException { |
|
private void extractTableText(XHTMLContentHandler xhtml, Table shape) throws SAXException { |
xhtml.startElement("table"); |
|
xhtml.startElement("table"); |
for (int row = 0; row < shape.getNumberOfRows(); row++){ |
|
for (int row = 0; row < shape.getNumberOfRows(); row++){ |
xhtml.startElement("tr"); |
|
xhtml.startElement("tr"); |
for (int col = 0; col < shape.getNumberOfColumns(); col++){ |
|
for (int col = 0; col < shape.getNumberOfColumns(); col++){ |
TableCell cell = shape.getCell(row, col); |
|
TableCell cell = shape.getCell(row, col); |
//insert empty string for empty cell if cell is null |
|
//insert empty string for empty cell if cell is null |
String txt = ""; |
|
String txt = ""; |
if (cell != null){ |
|
if (cell != null){ |
txt = cell.getText(); |
|
txt = cell.getText(); |
} |
|
} |
xhtml.element("td", txt); |
|
xhtml.element("td", txt); |
} |
|
} |
xhtml.endElement("tr"); |
|
xhtml.endElement("tr"); |
} |
|
} |
xhtml.endElement("table"); |
|
xhtml.endElement("table"); |
} |
|
} |
|
|
|
private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs) throws SAXException { |
|
private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs) throws SAXException { |
if (runs==null) { |
|
if (runs==null) { |
return; |
|
return; |
} |
|
} |
|
|
|
for (TextRun run : runs) { |
|
for (TextRun run : runs) { |
if (run != null) { |
|
if (run != null) { |
// Leaving in wisdom from TIKA-712 for easy revert. |
|
// Leaving in wisdom from TIKA-712 for easy revert. |
// Avoid boiler-plate text on the master slide (0 |
|
// Avoid boiler-plate text on the master slide (0 |
// = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE): |
|
// = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE): |
//if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) { |
|
//if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) { |
String txt = run.getText(); |
|
String txt = run.getText(); |
if (txt != null){ |
|
if (txt != null){ |
xhtml.characters(txt); |
|
xhtml.characters(txt); |
xhtml.startElement("br"); |
|
xhtml.startElement("br"); |
xhtml.endElement("br"); |
|
xhtml.endElement("br"); |
} |
|
} |
} |
|
} |
} |
|
} |
} |
|
} |
|
|
|
private void handleSlideEmbeddedPictures(SlideShow slideshow, XHTMLContentHandler xhtml) |
|
private void handleSlideEmbeddedPictures(SlideShow slideshow, XHTMLContentHandler xhtml) |
throws TikaException, SAXException, IOException { |
|
throws TikaException, SAXException, IOException { |
for (PictureData pic : slideshow.getPictureData()) { |
|
for (PictureData pic : slideshow.getPictureData()) { |
String mediaType = null; |
|
String mediaType = null; |
|
|
|
switch (pic.getType()) { |
|
switch (pic.getType()) { |
case Picture.EMF: |
|
case Picture.EMF: |
mediaType = "application/x-emf"; |
|
mediaType = "application/x-emf"; |
break; |
|
break; |
case Picture.JPEG: |
|
case Picture.JPEG: |
mediaType = "image/jpeg"; |
|
mediaType = "image/jpeg"; |
break; |
|
break; |
case Picture.PNG: |
|
case Picture.PNG: |
mediaType = "image/png"; |
|
mediaType = "image/png"; |
break; |
|
break; |
case Picture.WMF: |
|
case Picture.WMF: |
mediaType = "application/x-msmetafile"; |
|
mediaType = "application/x-msmetafile"; |
break; |
|
break; |
case Picture.DIB: |
|
case Picture.DIB: |
mediaType = "image/bmp"; |
|
mediaType = "image/bmp"; |
break; |
|
break; |
} |
|
} |
|
|
|
handleEmbeddedResource( |
|
handleEmbeddedResource( |
TikaInputStream.get(pic.getData()), null, null, |
|
TikaInputStream.get(pic.getData()), null, null, |
mediaType, xhtml, false); |
|
mediaType, xhtml, false); |
} |
|
} |
} |
|
} |
|
|
|
private void handleSlideEmbeddedResources(Slide slide, XHTMLContentHandler xhtml) |
|
private void handleSlideEmbeddedResources(Slide slide, XHTMLContentHandler xhtml) |
throws TikaException, SAXException, IOException { |
|
throws TikaException, SAXException, IOException { |
Shape[] shapes; |
|
Shape[] shapes; |
try { |
|
try { |
shapes = slide.getShapes(); |
|
shapes = slide.getShapes(); |
} catch(NullPointerException e) { |
|
} catch(NullPointerException e) { |
// Sometimes HSLF hits problems |
|
// Sometimes HSLF hits problems |
// Please open POI bugs for any you come across! |
|
// Please open POI bugs for any you come across! |
return; |
|
return; |
} |
|
} |
|
|
|
for( Shape shape : shapes ) { |
|
for( Shape shape : shapes ) { |
if( shape instanceof OLEShape ) { |
|
if( shape instanceof OLEShape ) { |
OLEShape oleShape = (OLEShape)shape; |
|
OLEShape oleShape = (OLEShape)shape; |
ObjectData data = null; |
|
ObjectData data = null; |
try { |
|
try { |
data = oleShape.getObjectData(); |
|
data = oleShape.getObjectData(); |
} catch( NullPointerException e ) { |
|
} catch( NullPointerException e ) { |
/* getObjectData throws NPE some times. */ |
|
/* getObjectData throws NPE some times. */ |
} |
|
} |
|
|
|
if (data != null) { |
|
if (data != null) { |
String objID = Integer.toString(oleShape.getObjectID()); |
|
String objID = Integer.toString(oleShape.getObjectID()); |
|
|
|
// Embedded Object: add a <div |
|
// Embedded Object: add a <div |
// class="embedded" id="X"/> so consumer can see where |
|
// class="embedded" id="X"/> so consumer can see where |
// in the main text each embedded document |
|
// in the main text each embedded document |
// occurred: |
|
// occurred: |
AttributesImpl attributes = new AttributesImpl(); |
|
AttributesImpl attributes = new AttributesImpl(); |
attributes.addAttribute("", "class", "class", "CDATA", "embedded"); |
|
attributes.addAttribute("", "class", "class", "CDATA", "embedded"); |
attributes.addAttribute("", "id", "id", "CDATA", objID); |
|
attributes.addAttribute("", "id", "id", "CDATA", objID); |
xhtml.startElement("div", attributes); |
|
xhtml.startElement("div", attributes); |
xhtml.endElement("div"); |
|
xhtml.endElement("div"); |
|
|
|
TikaInputStream stream = |
|
TikaInputStream stream = |
TikaInputStream.get(data.getData()); |
|
TikaInputStream.get(data.getData()); |
|
+- |
|
final DirectoryNode root; |
|
|
TikaInputStream tstream = TikaInputStream.cast(stream); |
|
|
if (tstream == null) { |
|
|
root = new NPOIFSFileSystem(new CloseShieldInputStream(stream)).getRoot(); |
|
|
} else { |
|
|
final Object container = tstream.getOpenContainer(); |
|
|
if (container instanceof NPOIFSFileSystem) { |
|
|
root = ((NPOIFSFileSystem) container).getRoot(); |
|
|
} else if (container instanceof DirectoryNode) { |
|
|
root = (DirectoryNode) container; |
|
|
} else if (tstream.hasFile()) { |
|
|
root = new NPOIFSFileSystem(tstream.getFileChannel()).getRoot(); |
|
|
} else { |
|
|
root = new NPOIFSFileSystem(new CloseShieldInputStream(tstream)).getRoot(); |
|
|
} |
|
|
} |
|
|
|
|
|
//POIFSDocumentType type = POIFSDocumentType.detectType(root); |
|
|
|
|
|
//System.out.println("POIFSDocumentType type = " +type); |
|
|
try{ |
|
|
handleEmbeddedOfficeDoc(root, xhtml); |
|
|
} catch(FileNotFoundException e){ |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
try { |
= |
try { |
String mediaType = null; |
|
String mediaType = null; |
if ("Excel.Chart.8".equals(oleShape.getProgID())) { |
|
if ("Excel.Chart.8".equals(oleShape.getProgID())) { |
mediaType = "application/vnd.ms-excel"; |
|
mediaType = "application/vnd.ms-excel"; |
} |
|
} |
handleEmbeddedResource( |
|
handleEmbeddedResource( |
stream, objID, objID, |
|
stream, objID, objID, |
mediaType, xhtml, false); |
|
mediaType, xhtml, false); |
} finally { |
|
} finally { |
stream.close(); |
|
stream.close(); |
} |
|
} |
} |
|
} |
} |
|
} |
} |
|
} |
} |
|
} |
} |
|
} |