Incam_SGD/search2/indexing/extractorCore.inc.php

<?php

/**
 * $Id:$
 *
 * KnowledgeTree Community Edition
 * Document Management Made Simple
 * Copyright (C) 2008, 2009 KnowledgeTree Inc.
 * 
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License version 3 as published by the
 * Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,
 * California 94120-7775, or email info@knowledgetree.com.
 *
 * The interactive user interfaces in modified source and object code versions
 * of this program must display Appropriate Legal Notices, as required under
 * Section 5 of the GNU General Public License version 3.
 *
 * In accordance with Section 7(b) of the GNU General Public License version 3,
 * these Appropriate Legal Notices must retain the display of the "Powered by
 * KnowledgeTree" logo and retain the original copyright notice. If the display of the
 * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices
 * must display the words "Powered by KnowledgeTree" and retain the original
 * copyright notice.
 * Contributor( s): ______________________________________
 *
 */

/**
 * DocumentExtractor is the base class for all text extractors.
 *
 */
abstract class DocumentExtractor
{
	/**
	 * The source filename from which to extract text.
	 *
	 * @var string
	 */
	protected $sourcefile;

	/**
	 * The target filename, where the extracted text must be stored.
	 *
	 * @var string
	 */
	protected $targetfile;

	/**
	 * The mime type of the source file.
	 *
	 * @var string
	 */
	protected $mimetype;

	/**
	 * The extension of the source file.
	 *
	 * @var string
	 */
	protected $extension;

	/**
	 * Reference to the document being indexed.
	 *
	 * @var Document
	 */
	protected $document;

	/**
	 * Indicates if the extractor needs an intermediate file or not.
	 * Generally the source file will be a file within the respository itself. Some extractors may
	 * require the source file to have the correct extension. Setting this to true will result in
	 * a file being created with the extension of the file. It is ideal to disable this if possible.
	 *
	 * @var boolean
	 */
	protected $needsIntermediate;

	/**
	 * The status of the extraction. If null, the extraction has not been done yet.
	 *
	 * @var boolean
	 */
	protected $extractionStatus;

	/**
	 * The status of the indexing. If null, the indexing has not been done yet.
	 *
	 * @var boolean
	 */
	protected $indexStatus;

	/**
	 * If an error occurred, this is the output that was captured
	 *
	 * @var string
	 */
	public $output;


	public function __construct()
	{
		$this->needsIntermediate=false;
		$this->extractionStatus = null;
		$this->indexStatus = null;
	}

	/**
	 * Sets the status of the indexing.
	 *
	 * @param unknown_type $status
	 */
	public function setIndexingStatus($status)
	{
		$this->indexStatus = $status;
	}
	/**
	 * Returns the indexing status.
	 *
	 * @return boolean
	 */
	public function getIndexingStatus()
	{
		return $this->indexStatus;
	}

	/**
	 * Sets the extraction status.
	 *
	 * @param boolean $status
	 */
	public function setExtractionStatus($status)
	{
		$this->extractionStatus = $status;
	}
	/**
	 * Return the extraction status.
	 *
	 * @return boolean
	 */
	public function getExtractionStatus()
	{
		return $this->extractionStatus;
	}

	/**
	 * This associates all the mime types associated with the extractor class.
	 *
	 */
	public function registerMimeTypes()
	{
		$types = $this->getSupportedMimeTypes();
		if (empty($types))
		{
			return;
		}
		$classname=get_class($this);

		$sql = "select id as extractor_id from mime_extractors WHERE name='$classname'";
		$rs = DBUtil::getResultArray($sql);
		if (count($rs) == 0)
		{
			$extractor_id = DBUtil::autoInsert('mime_extractors', array('name'=>$classname, 'active'=>1));
		}
		else
		{
			$extractor_id = $rs[0]['extractor_id'];
		}


		foreach($types as $type)
		{
			$sql = "update mime_types set extractor_id=$extractor_id where mimetypes='$type' and extractor_id is null";
			$rs = DBUtil::runQuery($sql);
		}
	}

	/**
	 * Indicates if an intermediate file is required.
	 *
	 * @param $value boolean Optional. If set, we set the value.
	 * @return boolean
	 */
	public function needsIntermediateSourceFile($value = null)
	{
		if (!is_null($value))
		{
			$this->needsIntermediate = $value;
		}
		return $this->needsIntermediate;
	}

	/**
	 * Sets the source filename for the document extractor.
	 *
	 * @param string $sourcefile
	 */
	public function setSourceFile($sourcefile)
	{
		$this->sourcefile=$sourcefile;
	}

	/**
	 * Returns the source file name.
	 *
	 * @return string
	 */
	public function getSourceFile() { return $this->sourcefile; }

	/**
	 * Sets the source file's mime type.
	 *
	 * @param string $mimetype
	 */
	public function setMimeType($mimetype)
	{
		$this->mimetype=$mimetype;
	}
	/**
	 * Returns the mime type for the source file.
	 *
	 * @return string
	 */
	public function getMimeType() { return $this->mimetype; }

	/**
	 * Indicates the extension for the source file.
	 *
	 * @param string $extension
	 */
	public function setExtension($extension)
	{
		$this->extension=$extension;
	}
	/**
	 * Returns the extension of the source file.
	 *
	 * @return string
	 */
	public function getExtension() { return $this->extension; }

	/**
	 * Sets the file name of the target text file.
	 *
	 * @param string $targetfile
	 */
	public function setTargetFile($targetfile)
	{
		$this->targetfile=$targetfile;
	}

	/**
	 * Gets the file name of the target text file containing the extracted text.
	 *
	 * @return unknown
	 */
	public function getTargetFile() { return $this->targetfile; }

	/**
	 * Filter function that may be applied after extraction. This may be overridden.
	 *
	 * @param string $text
	 * @return string
	 */
	protected function filter($text)
	{
		return $text;
	}

	/**
	 * Set the document that will be indexed.
	 *
	 * @param Document $document
	 */
	public function setDocument($document)
	{
		$this->document = $document;
	}

	/**
	 * Returns a reference to the document.
	 *
	 * @return string
	 */
	public function getDocument()
	{
		return $this->document;
	}

	/**
	 * Returns an array of supported mime types.
	 * e.g. return array('plain/text');
	 *
	 *
	 * @return array
	 *
	 */
	public abstract function getSupportedMimeTypes();

	/**
	 * Extracts the content from the source file.
	 *
	 * @return boolean
	 */
	public abstract function extractTextContent();

	/**
	 * Returns a friendly name for the document text extractor.
	 *
	 * @return string
	 */
	public abstract function getDisplayName();

	/**
	 * Attempts to diagnose any problems with the indexing process.
	 *
	 * @return string
	 */
	public abstract function diagnose();

}

/**
 * This class extends the document extractor to execute some command line application.
 * The getCommandLine() method needs to be overridden.
 *
 */
abstract class ExternalDocumentExtractor extends DocumentExtractor
{
    protected $allowOutput = false;
    protected $pipeStdoutToDevNull = false;

	/**
	 * Initialise the extractor.
	 *
	 */
	public function __construct()
	{
		parent::__construct();
		putenv('LANG=en_US.UTF-8');

		$config = KTConfig::getSingleton();

		$default = realpath(str_replace('\\','/',KT_DIR . '/../openoffice/program'));

		putenv('ooProgramPath=' . $config->get('openoffice/programPath', $default));
	}

	public function setAllowOutput($allowOutput)
	{
	    $this->allowOutput = $allowOutput;
	}

	/**
	 * Executes a command. Returns true if successful.
	 *
	 * @param string $cmd A command line instruction.
	 * @return boolean
	 */
	protected  function exec($cmd)
	{
		$config = KTConfig::getSingleton();
		$temp_dir = $config->get('urls/tmpDirectory');
		$res = 0;

		$docid = $this->document->getId();

		$script_prefix = $temp_dir . '/' . time() . '-' . $docid;
		$script_out = $script_prefix . '.out';

		// define the scripts that we want

		if (OS_WINDOWS)
		{
			$script_name = $script_prefix . '.bat';

			$script = "rem This is an auto generated file. \n";
			$script .= $cmd . ' 2>"' . $script_out . "\"\r\n";
			$script .= "set er=%ERRORLEVEL%\r\n";
			$script .= "exit /B %er%\r\n";
		}
		else
		{
			$script_name = $script_prefix . '.sh';

			$script = "#!/bin/sh\n";
			$script .= "# This is an auto generated file. \n";
			$script .= $cmd . ' 2>>"' . $script_out . "\"";

			if ($this->pipeStdoutToDevNull)
			{
			    $script .= " >/dev/null";
			}

			$script .= "\n";

			$script .= "exit $?\n";
		}

		// write the script file
		if (file_put_contents($script_name, $script) === false)
		{
			$this->output = _kt('Could not create exec script: ') . $script_name;
			return false;
		}

		// execute the script file
		if (OS_WINDOWS)
		{
			$res = KTUtil::pexec("\"$script_name\"");
			$res = $res['ret'];
		}
		else
		{
			if (chmod($script_name, 0755) === false)
			{
				$this->output = _kt('Could change permission on exec script: ') . $script_name;
				return false;
			}
			system($script_name, $res);
		}

		// remote the script file and get the output if available
		@unlink($script_name);

		if (file_exists($script_out))
		{
			$this->output = file_get_contents($script_out);
			@unlink($script_out);
		}

		return ($res == 0) && (empty($this->output) || $this->allowOutput);
	}

	/**
	 * Returns the command line string to be executed.
	 * The command returned should include the target filename.
	 *
	 * @return string
	 */
	protected function getCommandLine()
	{
		throw new Exception(_kt('getCommandLine is not implemented'));
	}

	/**
	 * Executes the command that executes the command.
	 * Returns true if success.
	 *
	 * @return boolean
	 */
	public function extractTextContent()
	{
		global $default;

		$cmdline = $this->getCommandLine();

		$class = get_class($this);
		$default->log->debug("$class: "  . $cmdline);

		return $this->exec($cmdline);
	}

}

abstract class OOFallbackDocumentExtractor extends ExternalDocumentExtractor
{
    protected $cmd;
    protected $params;

    /**
     * Enter description here...
     *
     * @var StarOfficeExtractor
     */
    protected $oo;

    public function __construct($cmd, $params)
    {
        parent::__construct();
        $this->cmd = KTUtil::findCommand('externalBinary/' . $cmd, false);

        $config = KTConfig::getSingleton();
        $this->params = $config->get('indexer/' . $cmd . 'cmdline', $params);
        $this->useOO = false; //$config->get('indexer/useOpenOffice', true);
        if (!$config->get('indexer/use_' . $cmd, true) || OS_WINDOWS)
        {
            $this->cmd = false;
        }

        if ($this->useOO)
        {
//            require_once('extractors/StarOfficeExtractor.inc.php');
//            $this->oo = new StarOfficeExtractor();
        }
    }

	public function needsIntermediateSourceFile()
	{
		// we need the intermediate file because it
		// has the correct extension. documentConverter uses the extension to determine mimetype

		return ($this->useOO);
	}

	protected function getCommandLine()
	{
		$sourcefile = $this->sourcefile;
		$targetfile = $this->targetfile;
		$escape = '"';

        $cmd = $this->cmd;

		$cmdline = $this->params;
		$cmdline = eval("return \"$cmdline\";");

		$cmdline = str_replace('\\','/',$cmdline);

		return $cmdline;
	}


    public function extractTextContent()
    {
        if ($this->cmd !== false)
        {
            // so we have catppt or something
            $result = parent::extractTextContent();
            if ($result !== false)
            {
                // if it returns true, we can bail
                return true;
            }

            // if failure, fallthrough, and attempt OO
        }

        /*
        if ($this->useOO)
        {
            $this->oo->setSourceFile($this->sourcefile);
            $this->oo->setMimeType($this->mimetype);
            $this->oo->setExtension($this->extension);
            $this->oo->setTargetFile($this->targetfile);
            $this->oo->setDocument($this->document);
            $this->oo->setIndexingStatus(null);
            $this->oo->setExtractionStatus(null);

            $result = $this->oo->extractTextContent();

            $this->setIndexingStatus($this->oo->getIndexingStatus());
            $this->setExtractionStatus($this->oo->getExtractionStatus());
            $this->setTargetFile($this->oo->getTargetFile());

            return $result;
        }
        else
        {
        */
            global $default;
            $docId = $this->document->getId();
            $cmd = $this->cmd;
            $default->log->info("The document {$docId} cannot be indexed as {$cmd} is not available and OpenOffice is not in use.");
            file_put_contents($this->targetfile, '');
            return true;
        //}
    }

    public function diagnose()
    {
        if ($this->cmd !== false) // || !$this->useOO)
        {
            // cmd is found. we don't care about oo.
            // if we can't use oo, well, not much we can do....
            return null;
        }

        return false; //$this->oo->diagnose();
    }
}

/**
 * An extension to the extenal document extractor. A derived class simply needs
 * to implement a constructor and getSupportedMimeTypes().
 *
 */
abstract class ApplicationExtractor extends ExternalDocumentExtractor
{
	/**
	 * The full path to the application that will be run. This will be resolved from
	 * the path or using the config file.
	 *
	 * @var string
	 */
	private $application;
	/**
	 * The command name of the application that can be run.
	 *
	 * @var string
	 */
	private $command;
	/**
	 * This is the friendly name for the extractor.
	 *
	 * @var string
	 */
	private $displayname;
	/**
	 * The command line parameters for the application.
	 * This may include {source} and {target} where substitutions will be done.
	 *
	 * @var string
	 */
	private $params;

	/**
	 * Initialise the extractor.
	 *
	 * @param string $section The section in the config file.
	 * @param string $appname The application name in the config file.
	 * @param string $command The command that can be run.
	 * @param string $displayname
	 * @param string $params
	 */
	public function __construct($section, $appname, $command, $displayname, $params)
	{
		parent::__construct();

		$this->application = KTUtil::findCommand("$section/$appname", $command);
		$this->command = $command;
		$this->displayname = $displayname;
		$this->params = $params;
	}

	/**
	 * Return the display name.
	 *
	 * @return string
	 */
	public function getDisplayName()
	{
		return sprintf(_kt('%s') , $this->displayname);
	}

	/**
	 * Returns the command line after performing substitutions.
	 *
	 * @return unknown
	 */
	protected function getCommandLine()
	{
		$sources = array('{source}','{target}');
		$target = array($this->sourcefile, $this->targetfile);
		$escape = OS_WINDOWS?'"':'\'';
		$cmdline = $escape . $this->application . $escape . ' ' . str_replace($sources,$target, $this->params);

		return $cmdline;
	}

	/**
	 * Identifies if there are any circumstances why the command can not run that could result in the text extraction process
	 * failing.
	 *
	 * @return mixed Returns string if there is a problem, null otherwise.
	 */
	public function diagnose()
	{
		if (false === $this->application)
		{
			return sprintf(_kt("Cannot locate binary for %s (%s)."), $this->displayname, $this->command);
		}

		return null;
	}
}

abstract class TextExtractor extends DocumentExtractor
{
	/**
	 * This extracts the text from the document.
	 *
	 * @return boolean
	 */
	public function extractTextContent()
	{
	    $config = KTConfig::getSingleton();
		$maxTextSize = $config->get('indexer/maxTextSize', 1024 * 1024 * 10); // we'll only take 10 meg by default
		$content = file_get_contents($this->sourcefile, null, null, null, $maxTextSize);
		if (false === $content)
		{
			return false;
		}

		$result = file_put_contents($this->targetfile, $this->filter($content));

		return false !== $result;
	}

	/**
	 * There are no external dependancies to diagnose.
	 *
	 * @return null
	 */
	public function diagnose()
	{
		return null;
	}

}

/**
 * The composite extractor implies that a conversion is done to an intermediate form before another extractor is run.
 *
 */
abstract class CompositeExtractor extends DocumentExtractor
{
	/**
	 * The initial extractor
	 *
	 * @var DocumentExtractor
	 */
	private $sourceExtractor;
	/**
	 * The text extractor
	 *
	 * @var DocumentExtractor
	 */
	private $targetExtractor;
	/**
	 * The extension for the initial extraction
	 *
	 * @var string
	 */
	private $targetExtension;
	/**
	 * The mime type of the initial extraction.
	 *
	 * @var string
	 */
	private $targetMimeType;

	public function __construct($sourceExtractor, $targetExtension, $targetMimeType, $targetExtractor, $needsIntermediate)
	{
		$this->sourceExtractor = $sourceExtractor;
		$this->targetExtractor = $targetExtractor;
		$this->targetExtension = $targetExtension;
		$this->targetMimeType = $targetMimeType;
		$this->needsIntermediateSourceFile($needsIntermediate);
	}

	/**
	 * Extracts the content of the document
	 *
	 * @return string
	 */
	public function extractTextContent()
	{
		$intermediateFile = $this->targetfile . '.' . $this->targetExtension;
		touch($intermediateFile);

		$this->sourceExtractor->setSourceFile($this->sourcefile);
		$this->sourceExtractor->setTargetFile($intermediateFile);
		$this->sourceExtractor->setDocument($this->getDocument());
		$this->sourceExtractor->setMimeType($this->mimetype);
		$this->sourceExtractor->setExtension($this->extension);
		if (!$this->sourceExtractor->extractTextContent())
		{
			$this->output = $this->sourceExtractor->output;
			@unlink($intermediateFile);
			return false;
		}
		$intermediateFile = $this->sourceExtractor->getTargetFile();

		$this->targetExtractor->setSourceFile($intermediateFile);
		$this->targetExtractor->setTargetFile($this->targetfile);
		$this->targetExtractor->setDocument($this->getDocument());
		$this->targetExtractor->setMimeType($this->targetMimeType);
		$this->targetExtractor->setExtension($this->targetExtension);
		$result = $this->targetExtractor->extractTextContent();
		if (!$result)
		{
			$this->output = $this->targetExtractor->output;
		}

		@unlink($intermediateFile);
		$this->setTargetFile($this->targetExtractor->getTargetFile());

		return $result;
	}

	/**
	 * Diagnose the extractors
	 *
	 * @return mixed
	 */
	public function diagnose()
	{
		$diagnosis = $this->sourceExtractor->diagnose();
		if (!empty($diagnosis))
		{
			return $diagnosis;
		}

		$diagnosis = $this->targetExtractor->diagnose();
		if (!empty($diagnosis))
		{
			return $diagnosis;
		}

		return null;
	}
}


/**
 * The purpose of an extractor hook is to effect the
 *
 */
abstract class ExtractorHook
{
	/**
	 * Returns an array of supported mime types.
	 * e.g. return array('plain/text');
	 *
	 *
	 * @return array
	 *
	 */
	public abstract function getSupportedMimeTypes();

	/**
	 * Returns the friendly name for the hook.
	 *
	 * @return string
	 */
	public abstract function getDisplayName();

	/**
	 * This does a basic diagnosis on the hook.
	 *
	 * @return string
	 */
	public function diagnose()
	{
		return null;
	}

	/**
	 * Perform any pre extraction activities.
	 *
	 * @param DocumentExtractor $extractor
	 */
	public function pre_extract($extractor)
	{
	}

	/**
	 * Perform any post extraction activities.
	 *
	 * @param DocumentExtractor $extractor
	 */
	public function post_extract($extractor)
	{

	}

	/**
	 * Perform any pre indexing activities.
	 *
	 * @param DocumentExtractor $extractor
	 */
	public function pre_index($extractor)
	{

	}

	/**
	 * Perform any post indexing activities.
	 *
	 * @param DocumentExtractor $extractor
	 */
	public function post_index($extractor)
	{

	}
}

?>
Importación inicial con versión 3.7.0.2 original git-svn-id: https://192.168.0.254/svn/Proyectos.Incam_SGD/tags/3.7.0.2_original@1 eb19766c-00d9-a042-a3a0-45cb8ec72764 2010-09-10 16:45:26 +00:00			`<?php`

			`/**`
			`* $Id:$`
			`*`
			`* KnowledgeTree Community Edition`
			`* Document Management Made Simple`
			`* Copyright (C) 2008, 2009 KnowledgeTree Inc.`
			`*`
			`*`
			`* This program is free software; you can redistribute it and/or modify it under`
			`* the terms of the GNU General Public License version 3 as published by the`
			`* Free Software Foundation.`
			`*`
			`* This program is distributed in the hope that it will be useful, but WITHOUT`
			`* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS`
			`* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more`
			`* details.`
			`*`
			`* You should have received a copy of the GNU General Public License`
			`* along with this program. If not, see <http://www.gnu.org/licenses/>.`
			`*`
			`* You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,`
			`* California 94120-7775, or email info@knowledgetree.com.`
			`*`
			`* The interactive user interfaces in modified source and object code versions`
			`* of this program must display Appropriate Legal Notices, as required under`
			`* Section 5 of the GNU General Public License version 3.`
			`*`
			`* In accordance with Section 7(b) of the GNU General Public License version 3,`
			`* these Appropriate Legal Notices must retain the display of the "Powered by`
			`* KnowledgeTree" logo and retain the original copyright notice. If the display of the`
			`* logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices`
			`* must display the words "Powered by KnowledgeTree" and retain the original`
			`* copyright notice.`
			`* Contributor( s): ______________________________________`
			`*`
			`*/`

			`/**`
			`* DocumentExtractor is the base class for all text extractors.`
			`*`
			`*/`
			`abstract class DocumentExtractor`
			`{`
			`/**`
			`* The source filename from which to extract text.`
			`*`
			`* @var string`
			`*/`
			`protected $sourcefile;`

			`/**`
			`* The target filename, where the extracted text must be stored.`
			`*`
			`* @var string`
			`*/`
			`protected $targetfile;`

			`/**`
			`* The mime type of the source file.`
			`*`
			`* @var string`
			`*/`
			`protected $mimetype;`

			`/**`
			`* The extension of the source file.`
			`*`
			`* @var string`
			`*/`
			`protected $extension;`

			`/**`
			`* Reference to the document being indexed.`
			`*`
			`* @var Document`
			`*/`
			`protected $document;`

			`/**`
			`* Indicates if the extractor needs an intermediate file or not.`
			`* Generally the source file will be a file within the respository itself. Some extractors may`
			`* require the source file to have the correct extension. Setting this to true will result in`
			`* a file being created with the extension of the file. It is ideal to disable this if possible.`
			`*`
			`* @var boolean`
			`*/`
			`protected $needsIntermediate;`

			`/**`
			`* The status of the extraction. If null, the extraction has not been done yet.`
			`*`
			`* @var boolean`
			`*/`
			`protected $extractionStatus;`

			`/**`
			`* The status of the indexing. If null, the indexing has not been done yet.`
			`*`
			`* @var boolean`
			`*/`
			`protected $indexStatus;`

			`/**`
			`* If an error occurred, this is the output that was captured`
			`*`
			`* @var string`
			`*/`
			`public $output;`


			`public function __construct()`
			`{`
			`$this->needsIntermediate=false;`
			`$this->extractionStatus = null;`
			`$this->indexStatus = null;`
			`}`

			`/**`
			`* Sets the status of the indexing.`
			`*`
			`* @param unknown_type $status`
			`*/`
			`public function setIndexingStatus($status)`
			`{`
			`$this->indexStatus = $status;`
			`}`
			`/**`
			`* Returns the indexing status.`
			`*`
			`* @return boolean`
			`*/`
			`public function getIndexingStatus()`
			`{`
			`return $this->indexStatus;`
			`}`

			`/**`
			`* Sets the extraction status.`
			`*`
			`* @param boolean $status`
			`*/`
			`public function setExtractionStatus($status)`
			`{`
			`$this->extractionStatus = $status;`
			`}`
			`/**`
			`* Return the extraction status.`
			`*`
			`* @return boolean`
			`*/`
			`public function getExtractionStatus()`
			`{`
			`return $this->extractionStatus;`
			`}`

			`/**`
			`* This associates all the mime types associated with the extractor class.`
			`*`
			`*/`
			`public function registerMimeTypes()`
			`{`
			`$types = $this->getSupportedMimeTypes();`
			`if (empty($types))`
			`{`
			`return;`
			`}`
			`$classname=get_class($this);`

			`$sql = "select id as extractor_id from mime_extractors WHERE name='$classname'";`
			`$rs = DBUtil::getResultArray($sql);`
			`if (count($rs) == 0)`
			`{`
			`$extractor_id = DBUtil::autoInsert('mime_extractors', array('name'=>$classname, 'active'=>1));`
			`}`
			`else`
			`{`
			`$extractor_id = $rs[0]['extractor_id'];`
			`}`


			`foreach($types as $type)`
			`{`
			`$sql = "update mime_types set extractor_id=$extractor_id where mimetypes='$type' and extractor_id is null";`
			`$rs = DBUtil::runQuery($sql);`
			`}`
			`}`

			`/**`
			`* Indicates if an intermediate file is required.`
			`*`
			`* @param $value boolean Optional. If set, we set the value.`
			`* @return boolean`
			`*/`
			`public function needsIntermediateSourceFile($value = null)`
			`{`
			`if (!is_null($value))`
			`{`
			`$this->needsIntermediate = $value;`
			`}`
			`return $this->needsIntermediate;`
			`}`

			`/**`
			`* Sets the source filename for the document extractor.`
			`*`
			`* @param string $sourcefile`
			`*/`
			`public function setSourceFile($sourcefile)`
			`{`
			`$this->sourcefile=$sourcefile;`
			`}`

			`/**`
			`* Returns the source file name.`
			`*`
			`* @return string`
			`*/`
			`public function getSourceFile() { return $this->sourcefile; }`

			`/**`
			`* Sets the source file's mime type.`
			`*`
			`* @param string $mimetype`
			`*/`
			`public function setMimeType($mimetype)`
			`{`
			`$this->mimetype=$mimetype;`
			`}`
			`/**`
			`* Returns the mime type for the source file.`
			`*`
			`* @return string`
			`*/`
			`public function getMimeType() { return $this->mimetype; }`

			`/**`
			`* Indicates the extension for the source file.`
			`*`
			`* @param string $extension`
			`*/`
			`public function setExtension($extension)`
			`{`
			`$this->extension=$extension;`
			`}`
			`/**`
			`* Returns the extension of the source file.`
			`*`
			`* @return string`
			`*/`
			`public function getExtension() { return $this->extension; }`

			`/**`
			`* Sets the file name of the target text file.`
			`*`
			`* @param string $targetfile`
			`*/`
			`public function setTargetFile($targetfile)`
			`{`
			`$this->targetfile=$targetfile;`
			`}`

			`/**`
			`* Gets the file name of the target text file containing the extracted text.`
			`*`
			`* @return unknown`
			`*/`
			`public function getTargetFile() { return $this->targetfile; }`

			`/**`
			`* Filter function that may be applied after extraction. This may be overridden.`
			`*`
			`* @param string $text`
			`* @return string`
			`*/`
			`protected function filter($text)`
			`{`
			`return $text;`
			`}`

			`/**`
			`* Set the document that will be indexed.`
			`*`
			`* @param Document $document`
			`*/`
			`public function setDocument($document)`
			`{`
			`$this->document = $document;`
			`}`

			`/**`
			`* Returns a reference to the document.`
			`*`
			`* @return string`
			`*/`
			`public function getDocument()`
			`{`
			`return $this->document;`
			`}`

			`/**`
			`* Returns an array of supported mime types.`
			`* e.g. return array('plain/text');`
			`*`
			`*`
			`* @return array`
			`*`
			`*/`
			`public abstract function getSupportedMimeTypes();`

			`/**`
			`* Extracts the content from the source file.`
			`*`
			`* @return boolean`
			`*/`
			`public abstract function extractTextContent();`

			`/**`
			`* Returns a friendly name for the document text extractor.`
			`*`
			`* @return string`
			`*/`
			`public abstract function getDisplayName();`

			`/**`
			`* Attempts to diagnose any problems with the indexing process.`
			`*`
			`* @return string`
			`*/`
			`public abstract function diagnose();`

			`}`

			`/**`
			`* This class extends the document extractor to execute some command line application.`
			`* The getCommandLine() method needs to be overridden.`
			`*`
			`*/`
			`abstract class ExternalDocumentExtractor extends DocumentExtractor`
			`{`
			`protected $allowOutput = false;`
			`protected $pipeStdoutToDevNull = false;`

			`/**`
			`* Initialise the extractor.`
			`*`
			`*/`
			`public function __construct()`
			`{`
			`parent::__construct();`
			`putenv('LANG=en_US.UTF-8');`

			`$config = KTConfig::getSingleton();`

			`$default = realpath(str_replace('\\','/',KT_DIR . '/../openoffice/program'));`

			`putenv('ooProgramPath=' . $config->get('openoffice/programPath', $default));`
			`}`

			`public function setAllowOutput($allowOutput)`
			`{`
			`$this->allowOutput = $allowOutput;`
			`}`

			`/**`
			`* Executes a command. Returns true if successful.`
			`*`
			`* @param string $cmd A command line instruction.`
			`* @return boolean`
			`*/`
			`protected function exec($cmd)`
			`{`
			`$config = KTConfig::getSingleton();`
			`$temp_dir = $config->get('urls/tmpDirectory');`
			`$res = 0;`

			`$docid = $this->document->getId();`

			`$script_prefix = $temp_dir . '/' . time() . '-' . $docid;`
			`$script_out = $script_prefix . '.out';`

			`// define the scripts that we want`

			`if (OS_WINDOWS)`
			`{`
			`$script_name = $script_prefix . '.bat';`

			`$script = "rem This is an auto generated file. \n";`
			`$script .= $cmd . ' 2>"' . $script_out . "\"\r\n";`
			`$script .= "set er=%ERRORLEVEL%\r\n";`
			`$script .= "exit /B %er%\r\n";`
			`}`
			`else`
			`{`
			`$script_name = $script_prefix . '.sh';`

			`$script = "#!/bin/sh\n";`
			`$script .= "# This is an auto generated file. \n";`
			`$script .= $cmd . ' 2>>"' . $script_out . "\"";`

			`if ($this->pipeStdoutToDevNull)`
			`{`
			`$script .= " >/dev/null";`
			`}`

			`$script .= "\n";`

			`$script .= "exit $?\n";`
			`}`

			`// write the script file`
			`if (file_put_contents($script_name, $script) === false)`
			`{`
			`$this->output = _kt('Could not create exec script: ') . $script_name;`
			`return false;`
			`}`

			`// execute the script file`
			`if (OS_WINDOWS)`
			`{`
			`$res = KTUtil::pexec("\"$script_name\"");`
			`$res = $res['ret'];`
			`}`
			`else`
			`{`
			`if (chmod($script_name, 0755) === false)`
			`{`
			`$this->output = _kt('Could change permission on exec script: ') . $script_name;`
			`return false;`
			`}`
			`system($script_name, $res);`
			`}`

			`// remote the script file and get the output if available`
			`@unlink($script_name);`

			`if (file_exists($script_out))`
			`{`
			`$this->output = file_get_contents($script_out);`
			`@unlink($script_out);`
			`}`

			`return ($res == 0) && (empty($this->output) \|\| $this->allowOutput);`
			`}`

			`/**`
			`* Returns the command line string to be executed.`
			`* The command returned should include the target filename.`
			`*`
			`* @return string`
			`*/`
			`protected function getCommandLine()`
			`{`
			`throw new Exception(_kt('getCommandLine is not implemented'));`
			`}`

			`/**`
			`* Executes the command that executes the command.`
			`* Returns true if success.`
			`*`
			`* @return boolean`
			`*/`
			`public function extractTextContent()`
			`{`
			`global $default;`

			`$cmdline = $this->getCommandLine();`

			`$class = get_class($this);`
			`$default->log->debug("$class: " . $cmdline);`

			`return $this->exec($cmdline);`
			`}`

			`}`

			`abstract class OOFallbackDocumentExtractor extends ExternalDocumentExtractor`
			`{`
			`protected $cmd;`
			`protected $params;`

			`/**`
			`* Enter description here...`
			`*`
			`* @var StarOfficeExtractor`
			`*/`
			`protected $oo;`

			`public function __construct($cmd, $params)`
			`{`
			`parent::__construct();`
			`$this->cmd = KTUtil::findCommand('externalBinary/' . $cmd, false);`

			`$config = KTConfig::getSingleton();`
			`$this->params = $config->get('indexer/' . $cmd . 'cmdline', $params);`
			`$this->useOO = false; //$config->get('indexer/useOpenOffice', true);`
			`if (!$config->get('indexer/use_' . $cmd, true) \|\| OS_WINDOWS)`
			`{`
			`$this->cmd = false;`
			`}`

			`if ($this->useOO)`
			`{`
			`// require_once('extractors/StarOfficeExtractor.inc.php');`
			`// $this->oo = new StarOfficeExtractor();`
			`}`
			`}`

			`public function needsIntermediateSourceFile()`
			`{`
			`// we need the intermediate file because it`
			`// has the correct extension. documentConverter uses the extension to determine mimetype`

			`return ($this->useOO);`
			`}`

			`protected function getCommandLine()`
			`{`
			`$sourcefile = $this->sourcefile;`
			`$targetfile = $this->targetfile;`
			`$escape = '"';`

			`$cmd = $this->cmd;`

			`$cmdline = $this->params;`
			`$cmdline = eval("return \"$cmdline\";");`

			`$cmdline = str_replace('\\','/',$cmdline);`

			`return $cmdline;`
			`}`


			`public function extractTextContent()`
			`{`
			`if ($this->cmd !== false)`
			`{`
			`// so we have catppt or something`
			`$result = parent::extractTextContent();`
			`if ($result !== false)`
			`{`
			`// if it returns true, we can bail`
			`return true;`
			`}`

			`// if failure, fallthrough, and attempt OO`
			`}`

			`/*`
			`if ($this->useOO)`
			`{`
			`$this->oo->setSourceFile($this->sourcefile);`
			`$this->oo->setMimeType($this->mimetype);`
			`$this->oo->setExtension($this->extension);`
			`$this->oo->setTargetFile($this->targetfile);`
			`$this->oo->setDocument($this->document);`
			`$this->oo->setIndexingStatus(null);`
			`$this->oo->setExtractionStatus(null);`

			`$result = $this->oo->extractTextContent();`

			`$this->setIndexingStatus($this->oo->getIndexingStatus());`
			`$this->setExtractionStatus($this->oo->getExtractionStatus());`
			`$this->setTargetFile($this->oo->getTargetFile());`

			`return $result;`
			`}`
			`else`
			`{`
			`*/`
			`global $default;`
			`$docId = $this->document->getId();`
			`$cmd = $this->cmd;`
			`$default->log->info("The document {$docId} cannot be indexed as {$cmd} is not available and OpenOffice is not in use.");`
			`file_put_contents($this->targetfile, '');`
			`return true;`
			`//}`
			`}`

			`public function diagnose()`
			`{`
			`if ($this->cmd !== false) // \|\| !$this->useOO)`
			`{`
			`// cmd is found. we don't care about oo.`
			`// if we can't use oo, well, not much we can do....`
			`return null;`
			`}`

			`return false; //$this->oo->diagnose();`
			`}`
			`}`

			`/**`
			`* An extension to the extenal document extractor. A derived class simply needs`
			`* to implement a constructor and getSupportedMimeTypes().`
			`*`
			`*/`
			`abstract class ApplicationExtractor extends ExternalDocumentExtractor`
			`{`
			`/**`
			`* The full path to the application that will be run. This will be resolved from`
			`* the path or using the config file.`
			`*`
			`* @var string`
			`*/`
			`private $application;`
			`/**`
			`* The command name of the application that can be run.`
			`*`
			`* @var string`
			`*/`
			`private $command;`
			`/**`
			`* This is the friendly name for the extractor.`
			`*`
			`* @var string`
			`*/`
			`private $displayname;`
			`/**`
			`* The command line parameters for the application.`
			`* This may include {source} and {target} where substitutions will be done.`
			`*`
			`* @var string`
			`*/`
			`private $params;`

			`/**`
			`* Initialise the extractor.`
			`*`
			`* @param string $section The section in the config file.`
			`* @param string $appname The application name in the config file.`
			`* @param string $command The command that can be run.`
			`* @param string $displayname`
			`* @param string $params`
			`*/`
			`public function __construct($section, $appname, $command, $displayname, $params)`
			`{`
			`parent::__construct();`

			`$this->application = KTUtil::findCommand("$section/$appname", $command);`
			`$this->command = $command;`
			`$this->displayname = $displayname;`
			`$this->params = $params;`
			`}`

			`/**`
			`* Return the display name.`
			`*`
			`* @return string`
			`*/`
			`public function getDisplayName()`
			`{`
			`return sprintf(_kt('%s') , $this->displayname);`
			`}`

			`/**`
			`* Returns the command line after performing substitutions.`
			`*`
			`* @return unknown`
			`*/`
			`protected function getCommandLine()`
			`{`
			`$sources = array('{source}','{target}');`
			`$target = array($this->sourcefile, $this->targetfile);`
			`$escape = OS_WINDOWS?'"':'\'';`
			`$cmdline = $escape . $this->application . $escape . ' ' . str_replace($sources,$target, $this->params);`

			`return $cmdline;`
			`}`

			`/**`
			`* Identifies if there are any circumstances why the command can not run that could result in the text extraction process`
			`* failing.`
			`*`
			`* @return mixed Returns string if there is a problem, null otherwise.`
			`*/`
			`public function diagnose()`
			`{`
			`if (false === $this->application)`
			`{`
			`return sprintf(_kt("Cannot locate binary for %s (%s)."), $this->displayname, $this->command);`
			`}`

			`return null;`
			`}`
			`}`

			`abstract class TextExtractor extends DocumentExtractor`
			`{`
			`/**`
			`* This extracts the text from the document.`
			`*`
			`* @return boolean`
			`*/`
			`public function extractTextContent()`
			`{`
			`$config = KTConfig::getSingleton();`
			`$maxTextSize = $config->get('indexer/maxTextSize', 1024 * 1024 * 10); // we'll only take 10 meg by default`
			`$content = file_get_contents($this->sourcefile, null, null, null, $maxTextSize);`
			`if (false === $content)`
			`{`
			`return false;`
			`}`

			`$result = file_put_contents($this->targetfile, $this->filter($content));`

			`return false !== $result;`
			`}`

			`/**`
			`* There are no external dependancies to diagnose.`
			`*`
			`* @return null`
			`*/`
			`public function diagnose()`
			`{`
			`return null;`
			`}`

			`}`

			`/**`
			`* The composite extractor implies that a conversion is done to an intermediate form before another extractor is run.`
			`*`
			`*/`
			`abstract class CompositeExtractor extends DocumentExtractor`
			`{`
			`/**`
			`* The initial extractor`
			`*`
			`* @var DocumentExtractor`
			`*/`
			`private $sourceExtractor;`
			`/**`
			`* The text extractor`
			`*`
			`* @var DocumentExtractor`
			`*/`
			`private $targetExtractor;`
			`/**`
			`* The extension for the initial extraction`
			`*`
			`* @var string`
			`*/`
			`private $targetExtension;`
			`/**`
			`* The mime type of the initial extraction.`
			`*`
			`* @var string`
			`*/`
			`private $targetMimeType;`

			`public function __construct($sourceExtractor, $targetExtension, $targetMimeType, $targetExtractor, $needsIntermediate)`
			`{`
			`$this->sourceExtractor = $sourceExtractor;`
			`$this->targetExtractor = $targetExtractor;`
			`$this->targetExtension = $targetExtension;`
			`$this->targetMimeType = $targetMimeType;`
			`$this->needsIntermediateSourceFile($needsIntermediate);`
			`}`

			`/**`
			`* Extracts the content of the document`
			`*`
			`* @return string`
			`*/`
			`public function extractTextContent()`
			`{`
			`$intermediateFile = $this->targetfile . '.' . $this->targetExtension;`
			`touch($intermediateFile);`

			`$this->sourceExtractor->setSourceFile($this->sourcefile);`
			`$this->sourceExtractor->setTargetFile($intermediateFile);`
			`$this->sourceExtractor->setDocument($this->getDocument());`
			`$this->sourceExtractor->setMimeType($this->mimetype);`
			`$this->sourceExtractor->setExtension($this->extension);`
			`if (!$this->sourceExtractor->extractTextContent())`
			`{`
			`$this->output = $this->sourceExtractor->output;`
			`@unlink($intermediateFile);`
			`return false;`
			`}`
			`$intermediateFile = $this->sourceExtractor->getTargetFile();`

			`$this->targetExtractor->setSourceFile($intermediateFile);`
			`$this->targetExtractor->setTargetFile($this->targetfile);`
			`$this->targetExtractor->setDocument($this->getDocument());`
			`$this->targetExtractor->setMimeType($this->targetMimeType);`
			`$this->targetExtractor->setExtension($this->targetExtension);`
			`$result = $this->targetExtractor->extractTextContent();`
			`if (!$result)`
			`{`
			`$this->output = $this->targetExtractor->output;`
			`}`

			`@unlink($intermediateFile);`
			`$this->setTargetFile($this->targetExtractor->getTargetFile());`

			`return $result;`
			`}`

			`/**`
			`* Diagnose the extractors`
			`*`
			`* @return mixed`
			`*/`
			`public function diagnose()`
			`{`
			`$diagnosis = $this->sourceExtractor->diagnose();`
			`if (!empty($diagnosis))`
			`{`
			`return $diagnosis;`
			`}`

			`$diagnosis = $this->targetExtractor->diagnose();`
			`if (!empty($diagnosis))`
			`{`
			`return $diagnosis;`
			`}`

			`return null;`
			`}`
			`}`


			`/**`
			`* The purpose of an extractor hook is to effect the`
			`*`
			`*/`
			`abstract class ExtractorHook`
			`{`
			`/**`
			`* Returns an array of supported mime types.`
			`* e.g. return array('plain/text');`
			`*`
			`*`
			`* @return array`
			`*`
			`*/`
			`public abstract function getSupportedMimeTypes();`

			`/**`
			`* Returns the friendly name for the hook.`
			`*`
			`* @return string`
			`*/`
			`public abstract function getDisplayName();`

			`/**`
			`* This does a basic diagnosis on the hook.`
			`*`
			`* @return string`
			`*/`
			`public function diagnose()`
			`{`
			`return null;`
			`}`

			`/**`
			`* Perform any pre extraction activities.`
			`*`
			`* @param DocumentExtractor $extractor`
			`*/`
			`public function pre_extract($extractor)`
			`{`
			`}`

			`/**`
			`* Perform any post extraction activities.`
			`*`
			`* @param DocumentExtractor $extractor`
			`*/`
			`public function post_extract($extractor)`
			`{`

			`}`

			`/**`
			`* Perform any pre indexing activities.`
			`*`
			`* @param DocumentExtractor $extractor`
			`*/`
			`public function pre_index($extractor)`
			`{`

			`}`

			`/**`
			`* Perform any post indexing activities.`
			`*`
			`* @param DocumentExtractor $extractor`
			`*/`
			`public function post_index($extractor)`
			`{`

			`}`
			`}`

			`?>`