This repository has been archived on 2024-11-28. You can view files and clone it, but cannot push or open issues or pull requests.
Incam_SGD/search2/indexing/extractorCore.inc.php

901 lines
20 KiB
PHP
Raw Normal View History

<?php
/**
* $Id:$
*
* KnowledgeTree Community Edition
* Document Management Made Simple
* Copyright (C) 2008, 2009 KnowledgeTree Inc.
*
*
* This program is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License version 3 as published by the
* Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,
* California 94120-7775, or email info@knowledgetree.com.
*
* The interactive user interfaces in modified source and object code versions
* of this program must display Appropriate Legal Notices, as required under
* Section 5 of the GNU General Public License version 3.
*
* In accordance with Section 7(b) of the GNU General Public License version 3,
* these Appropriate Legal Notices must retain the display of the "Powered by
* KnowledgeTree" logo and retain the original copyright notice. If the display of the
* logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices
* must display the words "Powered by KnowledgeTree" and retain the original
* copyright notice.
* Contributor( s): ______________________________________
*
*/
/**
* DocumentExtractor is the base class for all text extractors.
*
*/
abstract class DocumentExtractor
{
/**
* The source filename from which to extract text.
*
* @var string
*/
protected $sourcefile;
/**
* The target filename, where the extracted text must be stored.
*
* @var string
*/
protected $targetfile;
/**
* The mime type of the source file.
*
* @var string
*/
protected $mimetype;
/**
* The extension of the source file.
*
* @var string
*/
protected $extension;
/**
* Reference to the document being indexed.
*
* @var Document
*/
protected $document;
/**
* Indicates if the extractor needs an intermediate file or not.
* Generally the source file will be a file within the respository itself. Some extractors may
* require the source file to have the correct extension. Setting this to true will result in
* a file being created with the extension of the file. It is ideal to disable this if possible.
*
* @var boolean
*/
protected $needsIntermediate;
/**
* The status of the extraction. If null, the extraction has not been done yet.
*
* @var boolean
*/
protected $extractionStatus;
/**
* The status of the indexing. If null, the indexing has not been done yet.
*
* @var boolean
*/
protected $indexStatus;
/**
* If an error occurred, this is the output that was captured
*
* @var string
*/
public $output;
public function __construct()
{
$this->needsIntermediate=false;
$this->extractionStatus = null;
$this->indexStatus = null;
}
/**
* Sets the status of the indexing.
*
* @param unknown_type $status
*/
public function setIndexingStatus($status)
{
$this->indexStatus = $status;
}
/**
* Returns the indexing status.
*
* @return boolean
*/
public function getIndexingStatus()
{
return $this->indexStatus;
}
/**
* Sets the extraction status.
*
* @param boolean $status
*/
public function setExtractionStatus($status)
{
$this->extractionStatus = $status;
}
/**
* Return the extraction status.
*
* @return boolean
*/
public function getExtractionStatus()
{
return $this->extractionStatus;
}
/**
* This associates all the mime types associated with the extractor class.
*
*/
public function registerMimeTypes()
{
$types = $this->getSupportedMimeTypes();
if (empty($types))
{
return;
}
$classname=get_class($this);
$sql = "select id as extractor_id from mime_extractors WHERE name='$classname'";
$rs = DBUtil::getResultArray($sql);
if (count($rs) == 0)
{
$extractor_id = DBUtil::autoInsert('mime_extractors', array('name'=>$classname, 'active'=>1));
}
else
{
$extractor_id = $rs[0]['extractor_id'];
}
foreach($types as $type)
{
$sql = "update mime_types set extractor_id=$extractor_id where mimetypes='$type' and extractor_id is null";
$rs = DBUtil::runQuery($sql);
}
}
/**
* Indicates if an intermediate file is required.
*
* @param $value boolean Optional. If set, we set the value.
* @return boolean
*/
public function needsIntermediateSourceFile($value = null)
{
if (!is_null($value))
{
$this->needsIntermediate = $value;
}
return $this->needsIntermediate;
}
/**
* Sets the source filename for the document extractor.
*
* @param string $sourcefile
*/
public function setSourceFile($sourcefile)
{
$this->sourcefile=$sourcefile;
}
/**
* Returns the source file name.
*
* @return string
*/
public function getSourceFile() { return $this->sourcefile; }
/**
* Sets the source file's mime type.
*
* @param string $mimetype
*/
public function setMimeType($mimetype)
{
$this->mimetype=$mimetype;
}
/**
* Returns the mime type for the source file.
*
* @return string
*/
public function getMimeType() { return $this->mimetype; }
/**
* Indicates the extension for the source file.
*
* @param string $extension
*/
public function setExtension($extension)
{
$this->extension=$extension;
}
/**
* Returns the extension of the source file.
*
* @return string
*/
public function getExtension() { return $this->extension; }
/**
* Sets the file name of the target text file.
*
* @param string $targetfile
*/
public function setTargetFile($targetfile)
{
$this->targetfile=$targetfile;
}
/**
* Gets the file name of the target text file containing the extracted text.
*
* @return unknown
*/
public function getTargetFile() { return $this->targetfile; }
/**
* Filter function that may be applied after extraction. This may be overridden.
*
* @param string $text
* @return string
*/
protected function filter($text)
{
return $text;
}
/**
* Set the document that will be indexed.
*
* @param Document $document
*/
public function setDocument($document)
{
$this->document = $document;
}
/**
* Returns a reference to the document.
*
* @return string
*/
public function getDocument()
{
return $this->document;
}
/**
* Returns an array of supported mime types.
* e.g. return array('plain/text');
*
*
* @return array
*
*/
public abstract function getSupportedMimeTypes();
/**
* Extracts the content from the source file.
*
* @return boolean
*/
public abstract function extractTextContent();
/**
* Returns a friendly name for the document text extractor.
*
* @return string
*/
public abstract function getDisplayName();
/**
* Attempts to diagnose any problems with the indexing process.
*
* @return string
*/
public abstract function diagnose();
}
/**
* This class extends the document extractor to execute some command line application.
* The getCommandLine() method needs to be overridden.
*
*/
abstract class ExternalDocumentExtractor extends DocumentExtractor
{
protected $allowOutput = false;
protected $pipeStdoutToDevNull = false;
/**
* Initialise the extractor.
*
*/
public function __construct()
{
parent::__construct();
putenv('LANG=en_US.UTF-8');
$config = KTConfig::getSingleton();
$default = realpath(str_replace('\\','/',KT_DIR . '/../openoffice/program'));
putenv('ooProgramPath=' . $config->get('openoffice/programPath', $default));
}
public function setAllowOutput($allowOutput)
{
$this->allowOutput = $allowOutput;
}
/**
* Executes a command. Returns true if successful.
*
* @param string $cmd A command line instruction.
* @return boolean
*/
protected function exec($cmd)
{
$config = KTConfig::getSingleton();
$temp_dir = $config->get('urls/tmpDirectory');
$res = 0;
$docid = $this->document->getId();
$script_prefix = $temp_dir . '/' . time() . '-' . $docid;
$script_out = $script_prefix . '.out';
// define the scripts that we want
if (OS_WINDOWS)
{
$script_name = $script_prefix . '.bat';
$script = "rem This is an auto generated file. \n";
$script .= $cmd . ' 2>"' . $script_out . "\"\r\n";
$script .= "set er=%ERRORLEVEL%\r\n";
$script .= "exit /B %er%\r\n";
}
else
{
$script_name = $script_prefix . '.sh';
$script = "#!/bin/sh\n";
$script .= "# This is an auto generated file. \n";
$script .= $cmd . ' 2>>"' . $script_out . "\"";
if ($this->pipeStdoutToDevNull)
{
$script .= " >/dev/null";
}
$script .= "\n";
$script .= "exit $?\n";
}
// write the script file
if (file_put_contents($script_name, $script) === false)
{
$this->output = _kt('Could not create exec script: ') . $script_name;
return false;
}
// execute the script file
if (OS_WINDOWS)
{
$res = KTUtil::pexec("\"$script_name\"");
$res = $res['ret'];
}
else
{
if (chmod($script_name, 0755) === false)
{
$this->output = _kt('Could change permission on exec script: ') . $script_name;
return false;
}
system($script_name, $res);
}
// remote the script file and get the output if available
@unlink($script_name);
if (file_exists($script_out))
{
$this->output = file_get_contents($script_out);
@unlink($script_out);
}
return ($res == 0) && (empty($this->output) || $this->allowOutput);
}
/**
* Returns the command line string to be executed.
* The command returned should include the target filename.
*
* @return string
*/
protected function getCommandLine()
{
throw new Exception(_kt('getCommandLine is not implemented'));
}
/**
* Executes the command that executes the command.
* Returns true if success.
*
* @return boolean
*/
public function extractTextContent()
{
global $default;
$cmdline = $this->getCommandLine();
$class = get_class($this);
$default->log->debug("$class: " . $cmdline);
return $this->exec($cmdline);
}
}
abstract class OOFallbackDocumentExtractor extends ExternalDocumentExtractor
{
protected $cmd;
protected $params;
/**
* Enter description here...
*
* @var StarOfficeExtractor
*/
protected $oo;
public function __construct($cmd, $params)
{
parent::__construct();
$this->cmd = KTUtil::findCommand('externalBinary/' . $cmd, false);
$config = KTConfig::getSingleton();
$this->params = $config->get('indexer/' . $cmd . 'cmdline', $params);
$this->useOO = false; //$config->get('indexer/useOpenOffice', true);
if (!$config->get('indexer/use_' . $cmd, true) || OS_WINDOWS)
{
$this->cmd = false;
}
if ($this->useOO)
{
// require_once('extractors/StarOfficeExtractor.inc.php');
// $this->oo = new StarOfficeExtractor();
}
}
public function needsIntermediateSourceFile()
{
// we need the intermediate file because it
// has the correct extension. documentConverter uses the extension to determine mimetype
return ($this->useOO);
}
protected function getCommandLine()
{
$sourcefile = $this->sourcefile;
$targetfile = $this->targetfile;
$escape = '"';
$cmd = $this->cmd;
$cmdline = $this->params;
$cmdline = eval("return \"$cmdline\";");
$cmdline = str_replace('\\','/',$cmdline);
return $cmdline;
}
public function extractTextContent()
{
if ($this->cmd !== false)
{
// so we have catppt or something
$result = parent::extractTextContent();
if ($result !== false)
{
// if it returns true, we can bail
return true;
}
// if failure, fallthrough, and attempt OO
}
/*
if ($this->useOO)
{
$this->oo->setSourceFile($this->sourcefile);
$this->oo->setMimeType($this->mimetype);
$this->oo->setExtension($this->extension);
$this->oo->setTargetFile($this->targetfile);
$this->oo->setDocument($this->document);
$this->oo->setIndexingStatus(null);
$this->oo->setExtractionStatus(null);
$result = $this->oo->extractTextContent();
$this->setIndexingStatus($this->oo->getIndexingStatus());
$this->setExtractionStatus($this->oo->getExtractionStatus());
$this->setTargetFile($this->oo->getTargetFile());
return $result;
}
else
{
*/
global $default;
$docId = $this->document->getId();
$cmd = $this->cmd;
$default->log->info("The document {$docId} cannot be indexed as {$cmd} is not available and OpenOffice is not in use.");
file_put_contents($this->targetfile, '');
return true;
//}
}
public function diagnose()
{
if ($this->cmd !== false) // || !$this->useOO)
{
// cmd is found. we don't care about oo.
// if we can't use oo, well, not much we can do....
return null;
}
return false; //$this->oo->diagnose();
}
}
/**
* An extension to the extenal document extractor. A derived class simply needs
* to implement a constructor and getSupportedMimeTypes().
*
*/
abstract class ApplicationExtractor extends ExternalDocumentExtractor
{
/**
* The full path to the application that will be run. This will be resolved from
* the path or using the config file.
*
* @var string
*/
private $application;
/**
* The command name of the application that can be run.
*
* @var string
*/
private $command;
/**
* This is the friendly name for the extractor.
*
* @var string
*/
private $displayname;
/**
* The command line parameters for the application.
* This may include {source} and {target} where substitutions will be done.
*
* @var string
*/
private $params;
/**
* Initialise the extractor.
*
* @param string $section The section in the config file.
* @param string $appname The application name in the config file.
* @param string $command The command that can be run.
* @param string $displayname
* @param string $params
*/
public function __construct($section, $appname, $command, $displayname, $params)
{
parent::__construct();
$this->application = KTUtil::findCommand("$section/$appname", $command);
$this->command = $command;
$this->displayname = $displayname;
$this->params = $params;
}
/**
* Return the display name.
*
* @return string
*/
public function getDisplayName()
{
return sprintf(_kt('%s') , $this->displayname);
}
/**
* Returns the command line after performing substitutions.
*
* @return unknown
*/
protected function getCommandLine()
{
$sources = array('{source}','{target}');
$target = array($this->sourcefile, $this->targetfile);
$escape = OS_WINDOWS?'"':'\'';
$cmdline = $escape . $this->application . $escape . ' ' . str_replace($sources,$target, $this->params);
return $cmdline;
}
/**
* Identifies if there are any circumstances why the command can not run that could result in the text extraction process
* failing.
*
* @return mixed Returns string if there is a problem, null otherwise.
*/
public function diagnose()
{
if (false === $this->application)
{
return sprintf(_kt("Cannot locate binary for %s (%s)."), $this->displayname, $this->command);
}
return null;
}
}
abstract class TextExtractor extends DocumentExtractor
{
/**
* This extracts the text from the document.
*
* @return boolean
*/
public function extractTextContent()
{
$config = KTConfig::getSingleton();
$maxTextSize = $config->get('indexer/maxTextSize', 1024 * 1024 * 10); // we'll only take 10 meg by default
$content = file_get_contents($this->sourcefile, null, null, null, $maxTextSize);
if (false === $content)
{
return false;
}
$result = file_put_contents($this->targetfile, $this->filter($content));
return false !== $result;
}
/**
* There are no external dependancies to diagnose.
*
* @return null
*/
public function diagnose()
{
return null;
}
}
/**
* The composite extractor implies that a conversion is done to an intermediate form before another extractor is run.
*
*/
abstract class CompositeExtractor extends DocumentExtractor
{
/**
* The initial extractor
*
* @var DocumentExtractor
*/
private $sourceExtractor;
/**
* The text extractor
*
* @var DocumentExtractor
*/
private $targetExtractor;
/**
* The extension for the initial extraction
*
* @var string
*/
private $targetExtension;
/**
* The mime type of the initial extraction.
*
* @var string
*/
private $targetMimeType;
public function __construct($sourceExtractor, $targetExtension, $targetMimeType, $targetExtractor, $needsIntermediate)
{
$this->sourceExtractor = $sourceExtractor;
$this->targetExtractor = $targetExtractor;
$this->targetExtension = $targetExtension;
$this->targetMimeType = $targetMimeType;
$this->needsIntermediateSourceFile($needsIntermediate);
}
/**
* Extracts the content of the document
*
* @return string
*/
public function extractTextContent()
{
$intermediateFile = $this->targetfile . '.' . $this->targetExtension;
touch($intermediateFile);
$this->sourceExtractor->setSourceFile($this->sourcefile);
$this->sourceExtractor->setTargetFile($intermediateFile);
$this->sourceExtractor->setDocument($this->getDocument());
$this->sourceExtractor->setMimeType($this->mimetype);
$this->sourceExtractor->setExtension($this->extension);
if (!$this->sourceExtractor->extractTextContent())
{
$this->output = $this->sourceExtractor->output;
@unlink($intermediateFile);
return false;
}
$intermediateFile = $this->sourceExtractor->getTargetFile();
$this->targetExtractor->setSourceFile($intermediateFile);
$this->targetExtractor->setTargetFile($this->targetfile);
$this->targetExtractor->setDocument($this->getDocument());
$this->targetExtractor->setMimeType($this->targetMimeType);
$this->targetExtractor->setExtension($this->targetExtension);
$result = $this->targetExtractor->extractTextContent();
if (!$result)
{
$this->output = $this->targetExtractor->output;
}
@unlink($intermediateFile);
$this->setTargetFile($this->targetExtractor->getTargetFile());
return $result;
}
/**
* Diagnose the extractors
*
* @return mixed
*/
public function diagnose()
{
$diagnosis = $this->sourceExtractor->diagnose();
if (!empty($diagnosis))
{
return $diagnosis;
}
$diagnosis = $this->targetExtractor->diagnose();
if (!empty($diagnosis))
{
return $diagnosis;
}
return null;
}
}
/**
* The purpose of an extractor hook is to effect the
*
*/
abstract class ExtractorHook
{
/**
* Returns an array of supported mime types.
* e.g. return array('plain/text');
*
*
* @return array
*
*/
public abstract function getSupportedMimeTypes();
/**
* Returns the friendly name for the hook.
*
* @return string
*/
public abstract function getDisplayName();
/**
* This does a basic diagnosis on the hook.
*
* @return string
*/
public function diagnose()
{
return null;
}
/**
* Perform any pre extraction activities.
*
* @param DocumentExtractor $extractor
*/
public function pre_extract($extractor)
{
}
/**
* Perform any post extraction activities.
*
* @param DocumentExtractor $extractor
*/
public function post_extract($extractor)
{
}
/**
* Perform any pre indexing activities.
*
* @param DocumentExtractor $extractor
*/
public function pre_index($extractor)
{
}
/**
* Perform any post indexing activities.
*
* @param DocumentExtractor $extractor
*/
public function post_index($extractor)
{
}
}
?>