git-svn-id: https://192.168.0.254/svn/Proyectos.Incam_SGD/tags/3.7.0.2_original@1 eb19766c-00d9-a042-a3a0-45cb8ec72764
901 lines
20 KiB
PHP
901 lines
20 KiB
PHP
<?php
/**
 * $Id:$
 *
 * KnowledgeTree Community Edition
 * Document Management Made Simple
 * Copyright (C) 2008, 2009 KnowledgeTree Inc.
 *
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License version 3 as published by the
 * Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,
 * California 94120-7775, or email info@knowledgetree.com.
 *
 * The interactive user interfaces in modified source and object code versions
 * of this program must display Appropriate Legal Notices, as required under
 * Section 5 of the GNU General Public License version 3.
 *
 * In accordance with Section 7(b) of the GNU General Public License version 3,
 * these Appropriate Legal Notices must retain the display of the "Powered by
 * KnowledgeTree" logo and retain the original copyright notice. If the display of the
 * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices
 * must display the words "Powered by KnowledgeTree" and retain the original
 * copyright notice.
 * Contributor( s): ______________________________________
 *
 */
|
|
|
|
/**
 * Common ancestor of every text extractor.
 *
 * It carries the per-document state (source and target file names, mime type,
 * extension, document reference, status flags) and declares the operations a
 * concrete extractor has to supply.
 */
abstract class DocumentExtractor
{
    /**
     * Name of the file the text is read from.
     *
     * @var string
     */
    protected $sourcefile;

    /**
     * Name of the file the extracted text is written to.
     *
     * @var string
     */
    protected $targetfile;

    /**
     * Mime type of the source file.
     *
     * @var string
     */
    protected $mimetype;

    /**
     * Extension of the source file.
     *
     * @var string
     */
    protected $extension;

    /**
     * The document currently being indexed.
     *
     * @var Document
     */
    protected $document;

    /**
     * Whether the extractor wants an intermediate copy of the source file.
     *
     * The source normally lives inside the repository; some extractors need a
     * file carrying the real extension, in which case this flag is switched on.
     * Leaving it off is preferable where possible.
     *
     * @var boolean
     */
    protected $needsIntermediate;

    /**
     * Outcome of the extraction; null until an extraction has been attempted.
     *
     * @var boolean
     */
    protected $extractionStatus;

    /**
     * Outcome of the indexing; null until indexing has been attempted.
     *
     * @var boolean
     */
    protected $indexStatus;

    /**
     * Output captured when an error occurred.
     *
     * @var string
     */
    public $output;

    /**
     * Starts with both statuses unset and no intermediate file required.
     */
    public function __construct()
    {
        $this->indexStatus = null;
        $this->extractionStatus = null;
        $this->needsIntermediate = false;
    }

    /**
     * Records the indexing status.
     *
     * @param boolean $status
     */
    public function setIndexingStatus($status)
    {
        $this->indexStatus = $status;
    }

    /**
     * Reports the indexing status.
     *
     * @return boolean Null while indexing has not been done.
     */
    public function getIndexingStatus()
    {
        return $this->indexStatus;
    }

    /**
     * Records the extraction status.
     *
     * @param boolean $status
     */
    public function setExtractionStatus($status)
    {
        $this->extractionStatus = $status;
    }

    /**
     * Reports the extraction status.
     *
     * @return boolean Null while extraction has not been done.
     */
    public function getExtractionStatus()
    {
        return $this->extractionStatus;
    }

    /**
     * Links the supported mime types to this extractor class in the database.
     *
     * Looks up (or inserts) the mime_extractors row named after the concrete
     * class, then points every supported mime_types row that has no extractor
     * yet at that row. Does nothing when no mime types are supported.
     */
    public function registerMimeTypes()
    {
        $types = $this->getSupportedMimeTypes();
        if (empty($types)) {
            return;
        }

        $classname = get_class($this);

        $sql = "select id as extractor_id from mime_extractors WHERE name='$classname'";
        $rows = DBUtil::getResultArray($sql);

        if (count($rows) != 0) {
            // already registered: reuse the existing row
            $extractor_id = $rows[0]['extractor_id'];
        } else {
            $extractor_id = DBUtil::autoInsert('mime_extractors', array('name'=>$classname, 'active'=>1));
        }

        foreach ($types as $type) {
            // only claim mime types that do not have an extractor assigned yet
            $sql = "update mime_types set extractor_id=$extractor_id where mimetypes='$type' and extractor_id is null";
            DBUtil::runQuery($sql);
        }
    }

    /**
     * Gets, and optionally sets, whether an intermediate file is required.
     *
     * @param boolean $value Optional. When not null, becomes the new setting.
     * @return boolean The setting after any update.
     */
    public function needsIntermediateSourceFile($value = null)
    {
        if ($value !== null) {
            $this->needsIntermediate = $value;
        }

        return $this->needsIntermediate;
    }

    /**
     * Stores the source file name.
     *
     * @param string $sourcefile
     */
    public function setSourceFile($sourcefile)
    {
        $this->sourcefile = $sourcefile;
    }

    /**
     * Gives back the source file name.
     *
     * @return string
     */
    public function getSourceFile()
    {
        return $this->sourcefile;
    }

    /**
     * Stores the mime type of the source file.
     *
     * @param string $mimetype
     */
    public function setMimeType($mimetype)
    {
        $this->mimetype = $mimetype;
    }

    /**
     * Gives back the mime type of the source file.
     *
     * @return string
     */
    public function getMimeType()
    {
        return $this->mimetype;
    }

    /**
     * Stores the extension of the source file.
     *
     * @param string $extension
     */
    public function setExtension($extension)
    {
        $this->extension = $extension;
    }

    /**
     * Gives back the extension of the source file.
     *
     * @return string
     */
    public function getExtension()
    {
        return $this->extension;
    }

    /**
     * Stores the name of the target text file.
     *
     * @param string $targetfile
     */
    public function setTargetFile($targetfile)
    {
        $this->targetfile = $targetfile;
    }

    /**
     * Gives back the name of the target file that holds the extracted text.
     *
     * @return string
     */
    public function getTargetFile()
    {
        return $this->targetfile;
    }

    /**
     * Post-extraction filter hook. The default hands the text back untouched;
     * subclasses may override it.
     *
     * @param string $text
     * @return string
     */
    protected function filter($text)
    {
        return $text;
    }

    /**
     * Stores the document that is going to be indexed.
     *
     * @param Document $document
     */
    public function setDocument($document)
    {
        $this->document = $document;
    }

    /**
     * Gives back the document set through setDocument().
     *
     * @return Document
     */
    public function getDocument()
    {
        return $this->document;
    }

    /**
     * Lists the mime types this extractor supports,
     * e.g. return array('plain/text');
     *
     * @return array
     */
    public abstract function getSupportedMimeTypes();

    /**
     * Extracts the content of the source file.
     *
     * @return boolean
     */
    public abstract function extractTextContent();

    /**
     * Gives a human friendly name for the extractor.
     *
     * @return string
     */
    public abstract function getDisplayName();

    /**
     * Tries to diagnose problems with the indexing process.
     *
     * @return string
     */
    public abstract function diagnose();
}
|
|
|
|
/**
 * Document extractor that delegates the work to a command line application.
 *
 * Subclasses override getCommandLine(). extractTextContent() logs that command
 * and hands it to exec(), which wraps it in a temporary script so that stderr
 * can be captured into $this->output.
 */
abstract class ExternalDocumentExtractor extends DocumentExtractor
{
    /**
     * When true, captured stderr output alone does not make exec() report failure.
     *
     * @var boolean
     */
    protected $allowOutput = false;

    /**
     * When true, stdout of the command is sent to /dev/null (non-Windows script only).
     *
     * @var boolean
     */
    protected $pipeStdoutToDevNull = false;

    /**
     * Initialise the extractor.
     *
     * Sets LANG to en_US.UTF-8 and exports ooProgramPath, taken from the
     * 'openoffice/programPath' config entry and defaulting to the
     * openoffice/program directory next to KT_DIR.
     */
    public function __construct()
    {
        parent::__construct();
        putenv('LANG=en_US.UTF-8');

        $config = KTConfig::getSingleton();

        $default = realpath(str_replace('\\','/',KT_DIR . '/../openoffice/program'));

        putenv('ooProgramPath=' . $config->get('openoffice/programPath', $default));
    }

    /**
     * Chooses whether stderr output from the command is tolerated.
     *
     * @param boolean $allowOutput
     */
    public function setAllowOutput($allowOutput)
    {
        $this->allowOutput = $allowOutput;
    }

    /**
     * Executes a command. Returns true if successful.
     *
     * The command is written to a temporary .bat/.sh script in the configured
     * 'urls/tmpDirectory', with stderr redirected to a sibling .out file. After
     * the run both files are removed and the captured stderr is stored in
     * $this->output. The call succeeds when the exit code is 0 and there is
     * either no captured output or output is allowed.
     *
     * @param string $cmd A command line instruction.
     * @return boolean
     */
    protected function exec($cmd)
    {
        $config = KTConfig::getSingleton();
        $temp_dir = $config->get('urls/tmpDirectory');
        $res = 0;

        $docid = $this->document->getId();

        $script_prefix = $temp_dir . '/' . time() . '-' . $docid;
        $script_out = $script_prefix . '.out';

        // define the scripts that we want

        if (OS_WINDOWS) {
            $script_name = $script_prefix . '.bat';

            $script = "rem This is an auto generated file. \n";
            $script .= $cmd . ' 2>"' . $script_out . "\"\r\n";
            // preserve the command's exit code as the exit code of the batch file
            $script .= "set er=%ERRORLEVEL%\r\n";
            $script .= "exit /B %er%\r\n";
        } else {
            $script_name = $script_prefix . '.sh';

            $script = "#!/bin/sh\n";
            $script .= "# This is an auto generated file. \n";
            $script .= $cmd . ' 2>>"' . $script_out . "\"";

            if ($this->pipeStdoutToDevNull) {
                $script .= " >/dev/null";
            }

            $script .= "\n";

            $script .= "exit $?\n";
        }

        // write the script file
        if (file_put_contents($script_name, $script) === false) {
            $this->output = _kt('Could not create exec script: ') . $script_name;
            return false;
        }

        // execute the script file
        if (OS_WINDOWS) {
            $res = KTUtil::pexec("\"$script_name\"");
            $res = $res['ret'];
        } else {
            if (chmod($script_name, 0755) === false) {
                $this->output = _kt('Could not change permission on exec script: ') . $script_name;
                // do not leave the generated script behind in the temp directory
                @unlink($script_name);
                return false;
            }
            system($script_name, $res);
        }

        // remove the script file and get the output if available
        @unlink($script_name);

        if (file_exists($script_out)) {
            $this->output = file_get_contents($script_out);
            @unlink($script_out);
        }

        return ($res == 0) && (empty($this->output) || $this->allowOutput);
    }

    /**
     * Returns the command line string to be executed.
     * The command returned should include the target filename.
     *
     * This default implementation always throws; subclasses must override it.
     *
     * @return string
     * @throws Exception when not overridden.
     */
    protected function getCommandLine()
    {
        throw new Exception(_kt('getCommandLine is not implemented'));
    }

    /**
     * Builds the command line, logs it at debug level and executes it.
     *
     * @return boolean True on success.
     */
    public function extractTextContent()
    {
        global $default;

        $cmdline = $this->getCommandLine();

        $class = get_class($this);
        $default->log->debug("$class: " . $cmdline);

        return $this->exec($cmdline);
    }
}
|
|
|
|
/**
 * External extractor that runs a configured binary and, when that is not
 * usable, writes an empty target file instead of failing.
 *
 * The OpenOffice fallback the class is named after is currently switched off
 * ($useOO is always false), so the empty-file path is the only fallback.
 */
abstract class OOFallbackDocumentExtractor extends ExternalDocumentExtractor
{
    /**
     * Result of KTUtil::findCommand() for the binary, or false when the binary
     * is disabled by configuration or the platform is Windows.
     *
     * @var mixed
     */
    protected $cmd;

    /**
     * Command line template, evaluated by getCommandLine().
     *
     * @var string
     */
    protected $params;

    /**
     * Whether the OpenOffice fallback is in use. Always set to false by the
     * constructor. Declared public because it used to be created dynamically,
     * which made it publicly readable.
     *
     * @var boolean
     */
    public $useOO = false;

    /**
     * Placeholder for the OpenOffice based extractor; never assigned while the
     * OpenOffice fallback is disabled.
     *
     * @var StarOfficeExtractor
     */
    protected $oo;

    /**
     * Resolves the external binary and its command line template.
     *
     * @param string $cmd    Name of the binary; looked up as 'externalBinary/<cmd>'.
     * @param string $params Default command line template, used when the config
     *                       entry 'indexer/<cmd>cmdline' is not set.
     */
    public function __construct($cmd, $params)
    {
        parent::__construct();
        $this->cmd = KTUtil::findCommand('externalBinary/' . $cmd, false);

        $config = KTConfig::getSingleton();
        $this->params = $config->get('indexer/' . $cmd . 'cmdline', $params);
        $this->useOO = false; //$config->get('indexer/useOpenOffice', true);

        // the binary is ignored when switched off in the config, and always on Windows
        if (!$config->get('indexer/use_' . $cmd, true) || OS_WINDOWS) {
            $this->cmd = false;
        }
    }

    /**
     * Indicates if an intermediate file is required.
     *
     * An intermediate file is only needed for the OpenOffice path, because it
     * carries the correct extension and documentConverter uses the extension to
     * determine the mimetype.
     *
     * @param boolean $value Accepted only to stay compatible with the parent
     *                       signature; it is ignored.
     * @return boolean
     */
    public function needsIntermediateSourceFile($value = null)
    {
        return ($this->useOO);
    }

    /**
     * Expands the command line template into the command to run.
     *
     * @return string
     */
    protected function getCommandLine()
    {
        // These locals look unused but must stay: the template is evaluated in
        // this scope and may reference any of them.
        $sourcefile = $this->sourcefile;
        $targetfile = $this->targetfile;
        $escape = '"';

        $cmd = $this->cmd;

        $cmdline = $this->params;
        // NOTE(review): eval() on a config supplied template executes arbitrary
        // PHP if the config value is ever attacker controlled. Kept as is so
        // existing templates keep working; consider a plain placeholder
        // substitution instead.
        $cmdline = eval("return \"$cmdline\";");

        $cmdline = str_replace('\\','/',$cmdline);

        return $cmdline;
    }

    /**
     * Runs the external binary when one is available.
     *
     * If the binary is unavailable or its run fails, an info message is logged,
     * an empty target file is written and true is still returned.
     *
     * @return boolean Always true.
     */
    public function extractTextContent()
    {
        if ($this->cmd !== false) {
            // so we have catppt or something
            $result = parent::extractTextContent();
            if ($result !== false) {
                // if it returns true, we can bail
                return true;
            }

            // if failure, fall through to the empty-file fallback
        }

        // The OpenOffice based fallback is disabled, so the only thing left to
        // do is to record the problem and produce an empty text file.
        global $default;
        $docId = $this->document->getId();
        $cmd = $this->cmd;
        $default->log->info("The document {$docId} cannot be indexed as {$cmd} is not available and OpenOffice is not in use.");
        file_put_contents($this->targetfile, '');
        return true;
    }

    /**
     * Reports whether the external binary is available.
     *
     * @return mixed Null when the binary was found, false otherwise.
     */
    public function diagnose()
    {
        if ($this->cmd !== false) { // || !$this->useOO)
            // cmd is found. we don't care about oo.
            // if we can't use oo, well, not much we can do....
            return null;
        }

        return false; //$this->oo->diagnose();
    }
}
|
|
|
|
/**
 * External extractor driven entirely by constructor arguments. A derived class
 * only has to provide a constructor and getSupportedMimeTypes().
 */
abstract class ApplicationExtractor extends ExternalDocumentExtractor
{
    /**
     * Value returned by KTUtil::findCommand() for the application; diagnose()
     * treats false as "binary not found".
     *
     * @var string
     */
    private $application;

    /**
     * Command name of the application.
     *
     * @var string
     */
    private $command;

    /**
     * Friendly name of the extractor.
     *
     * @var string
     */
    private $displayname;

    /**
     * Command line parameters. {source} and {target} are replaced by the
     * source and target file names.
     *
     * @var string
     */
    private $params;

    /**
     * Initialise the extractor.
     *
     * @param string $section     The section in the config file.
     * @param string $appname     The application name in the config file.
     * @param string $command     The command that can be run.
     * @param string $displayname Friendly name of the extractor.
     * @param string $params      Parameter template for the command.
     */
    public function __construct($section, $appname, $command, $displayname, $params)
    {
        parent::__construct();

        $this->params = $params;
        $this->displayname = $displayname;
        $this->command = $command;
        $this->application = KTUtil::findCommand("$section/$appname", $command);
    }

    /**
     * Gives the display name, passed through the translation helper.
     *
     * @return string
     */
    public function getDisplayName()
    {
        $format = _kt('%s');

        return sprintf($format, $this->displayname);
    }

    /**
     * Builds the command line: the quoted application followed by the
     * parameters with {source} and {target} substituted.
     *
     * @return string
     */
    protected function getCommandLine()
    {
        // double quotes on Windows, single quotes elsewhere
        $quote = OS_WINDOWS ? '"' : '\'';

        $arguments = str_replace(
            array('{source}', '{target}'),
            array($this->sourcefile, $this->targetfile),
            $this->params
        );

        return $quote . $this->application . $quote . ' ' . $arguments;
    }

    /**
     * Checks whether the application binary was located.
     *
     * @return mixed Returns string if there is a problem, null otherwise.
     */
    public function diagnose()
    {
        if ($this->application !== false) {
            return null;
        }

        return sprintf(_kt("Cannot locate binary for %s (%s)."), $this->displayname, $this->command);
    }
}
|
|
|
|
/**
 * Extractor for sources that already are text: the content is copied to the
 * target file after passing through filter().
 */
abstract class TextExtractor extends DocumentExtractor
{
    /**
     * This extracts the text from the document.
     *
     * At most 'indexer/maxTextSize' bytes (10 MB by default) are read from the
     * source file.
     *
     * @return boolean False when the source cannot be read or the target cannot
     *                 be written.
     */
    public function extractTextContent()
    {
        $config = KTConfig::getSingleton();
        $maxTextSize = $config->get('indexer/maxTextSize', 1024 * 1024 * 10); // we'll only take 10 meg by default

        // Explicit false/0 instead of null: passing null for the
        // $use_include_path and $offset parameters is deprecated in PHP 8.1+,
        // and these are the values null was coerced to.
        $content = file_get_contents($this->sourcefile, false, null, 0, $maxTextSize);
        if (false === $content) {
            return false;
        }

        $result = file_put_contents($this->targetfile, $this->filter($content));

        return false !== $result;
    }

    /**
     * There are no external dependencies to diagnose.
     *
     * @return null
     */
    public function diagnose()
    {
        return null;
    }
}
|
|
|
|
/**
 * The composite extractor implies that a conversion is done to an intermediate
 * form before another extractor is run: the source extractor produces the
 * intermediate file, the target extractor turns it into the final text.
 */
abstract class CompositeExtractor extends DocumentExtractor
{
    /**
     * The initial extractor
     *
     * @var DocumentExtractor
     */
    private $sourceExtractor;

    /**
     * The text extractor
     *
     * @var DocumentExtractor
     */
    private $targetExtractor;

    /**
     * The extension for the initial extraction
     *
     * @var string
     */
    private $targetExtension;

    /**
     * The mime type of the initial extraction.
     *
     * @var string
     */
    private $targetMimeType;

    /**
     * @param DocumentExtractor $sourceExtractor   Produces the intermediate file.
     * @param string            $targetExtension   Extension of the intermediate file.
     * @param string            $targetMimeType    Mime type of the intermediate file.
     * @param DocumentExtractor $targetExtractor   Extracts text from the intermediate file.
     * @param boolean           $needsIntermediate Passed to needsIntermediateSourceFile().
     */
    public function __construct($sourceExtractor, $targetExtension, $targetMimeType, $targetExtractor, $needsIntermediate)
    {
        // initialise the base state; without this needsIntermediate stays
        // null when $needsIntermediate is null
        parent::__construct();

        $this->sourceExtractor = $sourceExtractor;
        $this->targetExtractor = $targetExtractor;
        $this->targetExtension = $targetExtension;
        $this->targetMimeType = $targetMimeType;
        $this->needsIntermediateSourceFile($needsIntermediate);
    }

    /**
     * Extracts the content of the document in two steps.
     *
     * On failure of either step, that extractor's output is copied to
     * $this->output. Intermediate files are removed before returning.
     *
     * @return boolean
     */
    public function extractTextContent()
    {
        // placeholder for the intermediate result: "<target>.<extension>"
        $placeholder = $this->targetfile . '.' . $this->targetExtension;
        touch($placeholder);

        $this->sourceExtractor->setSourceFile($this->sourcefile);
        $this->sourceExtractor->setTargetFile($placeholder);
        $this->sourceExtractor->setDocument($this->getDocument());
        $this->sourceExtractor->setMimeType($this->mimetype);
        $this->sourceExtractor->setExtension($this->extension);
        if (!$this->sourceExtractor->extractTextContent()) {
            $this->output = $this->sourceExtractor->output;
            @unlink($placeholder);
            return false;
        }

        // the source extractor may have redirected its output to another file
        $intermediateFile = $this->sourceExtractor->getTargetFile();

        $this->targetExtractor->setSourceFile($intermediateFile);
        $this->targetExtractor->setTargetFile($this->targetfile);
        $this->targetExtractor->setDocument($this->getDocument());
        $this->targetExtractor->setMimeType($this->targetMimeType);
        $this->targetExtractor->setExtension($this->targetExtension);
        $result = $this->targetExtractor->extractTextContent();
        if (!$result) {
            $this->output = $this->targetExtractor->output;
        }

        @unlink($intermediateFile);
        if ($intermediateFile !== $placeholder) {
            // the placeholder created above would otherwise be left behind
            @unlink($placeholder);
        }
        $this->setTargetFile($this->targetExtractor->getTargetFile());

        return $result;
    }

    /**
     * Diagnose the extractors
     *
     * @return mixed The first non-empty diagnosis (source extractor first),
     *               or null when neither reports a problem.
     */
    public function diagnose()
    {
        $diagnosis = $this->sourceExtractor->diagnose();
        if (!empty($diagnosis)) {
            return $diagnosis;
        }

        $diagnosis = $this->targetExtractor->diagnose();
        if (!empty($diagnosis)) {
            return $diagnosis;
        }

        return null;
    }
}
|
|
|
|
|
|
/**
 * Base class for extractor hooks.
 *
 * A hook declares the mime types it applies to and offers callbacks around the
 * extraction and indexing steps. Every callback here is a no-op; subclasses
 * override the ones they need.
 */
abstract class ExtractorHook
{
    /**
     * Lists the mime types the hook supports,
     * e.g. return array('plain/text');
     *
     * @return array
     */
    public abstract function getSupportedMimeTypes();

    /**
     * Gives the friendly name of the hook.
     *
     * @return string
     */
    public abstract function getDisplayName();

    /**
     * Basic diagnosis of the hook. The default reports no problem.
     *
     * @return string Null by default.
     */
    public function diagnose()
    {
        return null;
    }

    /**
     * Callback for pre extraction activities. No-op by default.
     *
     * @param DocumentExtractor $extractor
     */
    public function pre_extract($extractor)
    {
        // intentionally empty
    }

    /**
     * Callback for post extraction activities. No-op by default.
     *
     * @param DocumentExtractor $extractor
     */
    public function post_extract($extractor)
    {
        // intentionally empty
    }

    /**
     * Callback for pre indexing activities. No-op by default.
     *
     * @param DocumentExtractor $extractor
     */
    public function pre_index($extractor)
    {
        // intentionally empty
    }

    /**
     * Callback for post indexing activities. No-op by default.
     *
     * @param DocumentExtractor $extractor
     */
    public function post_index($extractor)
    {
        // intentionally empty
    }
}
|
|
|
|
?>
|