/** * Copyright 2005 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.parse.mspowerpoint; import java.io.ByteArrayInputStream; import java.util.Properties; import java.util.logging.Logger; import org.apache.nutch.parse.Outlink; import org.apache.nutch.parse.OutlinkExtractor; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseStatus; import org.apache.nutch.parse.ParseData; import org.apache.nutch.parse.ParseException; import org.apache.nutch.parse.ParseImpl; import org.apache.nutch.parse.Parser; import org.apache.nutch.protocol.Content; import org.apache.nutch.util.LogFormatter; /** * Nutch-Parser for parsing MS PowerPoint slides ( mime type * application/vnd.ms-powerpoint). *
* It is based on org.apache.poi.*.
*
* @author Stephan Strittmatter - http://www.sybit.de
*
* @version 1.0
*/
public class MSPowerPointParser implements Parser {
/** Logger for this parser, obtained from LogFormatter using this class's name. */
private static final Logger LOG = LogFormatter
.getLogger(MSPowerPointParser.class.getName());
/**
 * Creates a new MS PowerPoint parser. The constructor takes no arguments
 * and its body is empty, so no setup is performed here.
 */
public MSPowerPointParser() {
}
/**
* Parses the MS PowerPoint file.
*
* @see org.apache.nutch.parse.Parser#getParse(Content)
*/
public Parse getParse(final Content content) { // throws ParseException {
// check that contentType is one we can handle
final String contentType = content.getContentType();
if (contentType != null
&& !contentType.startsWith(PPTConstants.MIME_TYPE)) {
return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT, "Content-Type not "
+ PPTConstants.MIME_TYPE + " was: " + contentType).getEmptyParse();
}
String plainText = null;
String title = null;
Outlink[] outlinks = null;
Properties properties = null;
try {
final String contentLen = content.get("Content-Length");
final int len = Integer.parseInt(contentLen);
final byte[] contentInOctets = content.getContent();
final ByteArrayInputStream input = new ByteArrayInputStream(
contentInOctets);
if (contentLen != null && contentInOctets.length != len) {
return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
"Content truncated at "
+ contentInOctets.length
+ " bytes(!= "
+ contentLen
+ "). Please increase