.
*
* You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,
* California 94120-7775, or email info@knowledgetree.com.
*
* The interactive user interfaces in modified source and object code versions
* of this program must display Appropriate Legal Notices, as required under
* Section 5 of the GNU General Public License version 3.
*
* In accordance with Section 7(b) of the GNU General Public License version 3,
* these Appropriate Legal Notices must retain the display of the "Powered by
* KnowledgeTree" logo and retain the original copyright notice. If the display of the
* logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices
* must display the words "Powered by KnowledgeTree" and retain the original
* copyright notice.
* Contributor( s): ______________________________________
*
*/
define('SEARCH2_INDEXER_DIR',realpath(dirname(__FILE__)) . '/');
require_once('indexing/extractorCore.inc.php');
require_once(KT_DIR . '/plugins/ktcore/scheduler/schedulerUtil.php');
require_once(KT_DIR . '/ktapi/ktapi.inc.php');
/**
 * Raised when a result reported by the document indexer has no matching
 * record in the database.
 */
class IndexerInconsistencyException extends Exception
{
}
// TODO: Query Result Items code should be moved into the Search section. It has less to do with indexing...
/**
 * Base class for a single search result.
 *
 * Besides the explicit getters, values may be read and written as virtual
 * properties: reading $item->Foo is routed to getFoo() and assigning
 * $item->Foo to setFoo() by the magic accessors below.
 */
class QueryResultItem
{
    protected $id;
    protected $title;
    protected $rank;
    protected $text;
    protected $fullpath;

    /**
     * @param mixed $id
     * @param string $title
     * @param mixed $rank
     * @param string $text
     * @param string $fullpath
     */
    public function __construct($id, $title, $rank, $text, $fullpath)
    {
        $this->id = $id;
        $this->title = $title;
        $this->rank = $rank;
        $this->text = $text;
        $this->fullpath = $fullpath;
    }

    public function getId() { return $this->id; }

    public function getRealId() { return $this->id; }

    public function getIsProxy() { return $this instanceof ProxyResultItem; }

    // Folder/document detection is based on the class name prefix, so it
    // also matches any subclass whose name starts with 'Folder'/'Document'.
    public function getIsFolder() { return substr(get_class($this), 0, 6) == 'Folder'; }

    public function getIsDocument() { return substr(get_class($this), 0, 8) == 'Document'; }

    /**
     * Store the rank rounded to two decimal places.
     *
     * No thousands separator is emitted: getRelevance() casts the stored
     * string back to float, and a value such as "1,234.50" would be
     * truncated to 1.0 by that cast.
     *
     * @param float $value
     */
    public function setRank($value)
    {
        $this->rank = number_format($value, 2, '.', '');
    }

    public function getIsLive()
    {
        return true;
    }

    public function setTitle($value)
    {
        $this->title = $value;
    }

    public function setText($value)
    {
        $this->text = $value;
    }

    public function getRelevance() { return (float) $this->rank; }

    public function getRank() { return $this->getRelevance(); }

    public function getText() { return (string) $this->text; }

    public function getTitle() { return (string) $this->title; }

    public function getFullPath() { return (string) $this->fullpath; }

    /**
     * Virtual property read: $this->Foo resolves to getFoo() when such a
     * method exists, otherwise to getUnknown().
     *
     * NOTE(review): PHP expects magic accessors to be public; the visibility
     * is left as-is here because subclasses override it as protected.
     *
     * @param string $property
     * @return mixed
     */
    protected function __get($property)
    {
        if (empty($property))
        {
            return '';
        }

        $method = 'get' . $property;
        if (method_exists($this, $method))
        {
            return $this->$method();
        }

        return $this->getUnknown();
    }

    /**
     * Value returned for virtual properties that have no getter.
     *
     * @return string
     */
    protected function getUnknown()
    {
        return _kt('n/a');
    }

    /**
     * Virtual property write: $this->Foo = $x resolves to setFoo($x).
     *
     * @param string $property
     * @param mixed $value
     * @throws Exception when no matching setter exists
     */
    protected function __set($property, $value)
    {
        if (empty($property))
        {
            return '';
        }

        $method = 'set' . $property;
        if (method_exists($this, $method))
        {
            return $this->$method($value);
        }

        throw new Exception("Unknown property '$property' to set on QueryResultItem");
    }
}
/**
 * Result item that stands in for another result item (the proxy target).
 *
 * The item reports its own id through getId() while getRealId() and any
 * getter/setter not defined on this class are forwarded to the wrapped item.
 */
class ProxyResultItem extends QueryResultItem
{
    protected $proxy;
    protected $proxyId;

    /**
     * @param mixed $proxyId Id reported by getId()
     * @param QueryResultItem $proxy The wrapped result item
     */
    public function __construct($proxyId, $proxy)
    {
        // getTitle must be called as a method; reading it as a property
        // went through the magic accessor and did not yield the title.
        parent::__construct($proxyId, $proxy->getTitle(), $proxy->getRank(), $proxy->getText(), $proxy->getFullPath());
        $this->proxyId = $proxyId;
        $this->proxy = $proxy;
    }

    public function getId() { return $this->proxyId; }

    public function getTitle() { return $this->proxy->getTitle(); }

    public function getRealId() { return $this->proxy->getId(); }

    /**
     * Virtual property read: use this object's getter when it exists,
     * otherwise call the same getter on the wrapped item.
     *
     * @param string $property
     * @return mixed
     */
    protected function __get($property)
    {
        $method = 'get' . $property;
        if (method_exists($this, $method))
        {
            return $this->$method();
        }

        return $this->proxy->$method();
    }

    /**
     * Virtual property write: use this object's setter when it exists,
     * otherwise call the same setter on the wrapped item.
     *
     * @param string $property
     * @param mixed $value
     * @return mixed
     */
    protected function __set($property, $value)
    {
        $method = 'set' . $property;
        if (method_exists($this, $method))
        {
            return $this->$method($value);
        }

        return $this->proxy->$method($value);
    }
}
/**
 * Search result item describing a document.
 *
 * The document's details are loaded from the database when the item is
 * constructed (see loadDocumentInfo()).
 */
class DocumentResultItem extends QueryResultItem
{
    protected $filesize;
    protected $live;
    protected $version;
    protected $mimeType;
    protected $filename;
    protected $thumbnail; // TODO: if not null, gui can display a thumbnail
    protected $viewer; // TODO: if not null, a viewer can be used to view the document
    protected $document;
    protected $checkedOutUser;
    protected $dateCheckedout;
    protected $workflowState;
    protected $workflow;
    protected $modifiedBy;
    protected $dateModified;
    protected $createdBy;
    protected $dateCreated;
    protected $owner;
    protected $immutable;
    protected $deleted;
    protected $status;
    protected $folderId;
    protected $storagePath;
    protected $documentType;
    protected $mimeIconPath;
    protected $mimeDisplay;
    protected $oemDocumentNo;
    protected $inclStatus = true;

    /**
     * @param int $document_id
     * @param mixed $rank
     * @param string $title
     * @param string $text
     * @param string $fullpath
     * @param bool $inclStatus When true, only a document with status_id = 1 is loaded
     * @throws IndexerInconsistencyException when no matching record is found
     * @throws Exception on a database error
     */
    public function __construct($document_id, $rank=null, $title=null, $text=null, $fullpath = null, $inclStatus = true)
    {
        parent::__construct($document_id, $title, $rank, $text, $fullpath);
        $this->live = true;
        $this->inclStatus = $inclStatus;
        $this->loadDocumentInfo();
    }

    // TODO: this is bad. must refactor to do the query on the group of documents.
    /**
     * Load the document's details from the database into this item.
     *
     * @throws IndexerInconsistencyException when no matching record is found
     * @throws Exception on a database error
     */
    public function loadDocumentInfo()
    {
        global $default;

        // The id is embedded in the query text, so force it to an integer.
        $documentId = (int) $this->id;

        $sql = "SELECT
                d.folder_id, f.full_path, f.name, dcv.size as filesize, dcv.major_version,
                dcv.minor_version, dcv.filename, cou.name as checkoutuser, w.human_name as workflow, ws.human_name as workflowstate,
                mt.mimetypes as mimetype, md.mime_doc as mimedoc, d.checkedout, mbu.name as modifiedbyuser, d.modified,
                cbu.name as createdbyuser, ou.name as owneruser, d.immutable, d.status_id, d.created,dcv.storage_path, dtl.name as document_type,
                mt.icon_path as mime_icon_path, mt.friendly_name as mime_display, d.oem_no, dmv.name as title
            FROM
                documents d
                INNER JOIN document_metadata_version dmv ON d.metadata_version_id = dmv.id
                INNER JOIN document_content_version dcv ON dmv.content_version_id = dcv.id
                INNER JOIN mime_types mt ON dcv.mime_id=mt.id
                LEFT JOIN document_types_lookup dtl ON dtl.id=dmv.document_type_id
                LEFT JOIN folders f ON f.id=d.folder_id
                LEFT JOIN users cou ON d.checked_out_user_id=cou.id
                LEFT JOIN workflows w ON dmv.workflow_id=w.id
                LEFT JOIN workflow_states ws ON dmv.workflow_state_id = ws.id
                LEFT JOIN mime_documents md ON mt.mime_document_id = md.id
                LEFT JOIN users mbu ON d.modified_user_id=mbu.id
                LEFT JOIN users cbu ON d.creator_id=cbu.id
                LEFT JOIN users ou ON d.owner_id=ou.id
            WHERE
                d.id=$documentId";

        if ($this->inclStatus)
        {
            $sql .= " AND d.status_id = 1";
        }

        $result = DBUtil::getOneResult($sql);

        if (PEAR::isError($result) || empty($result))
        {
            $this->live = false;
            if (PEAR::isError($result))
            {
                throw new Exception('Database exception! There appears to be an error in the system: ' .$result->getMessage());
            }

            $default->log->error('QueryResultItem: $result is null');
            $msg = 'The database did not have a record matching the result from the document indexer. This may occur if there is an inconsistency between the document indexer and the repository. The indexer needs to be repaired.';
            $default->log->error('QueryResultItem: ' . $msg);
            // TODO: repair process where we scan documents in index, and delete those for which there is nothing in the repository
            throw new IndexerInconsistencyException(sprintf(_kt('%s') , $msg));
        }

        // document_id, relevance, text, title
        $this->documentType = $result['document_type'];
        $this->filename = $result['filename'];
        $this->filesize = KTUtil::filesizeToString($result['filesize']);
        $this->folderId = $result['folder_id'];
        $this->title = $result['title'];

        $this->createdBy = $result['createdbyuser'];
        $this->dateCreated = $result['created'];

        $this->modifiedBy = $result['modifiedbyuser'];
        $this->dateModified = $result['modified'];

        $this->checkedOutUser = $result['checkoutuser'];
        $this->dateCheckedout = $result['checkedout'];

        $this->owner = $result['owneruser'];

        $this->version = $result['major_version'] . '.' . $result['minor_version'];

        $this->immutable = ($result['immutable'] + 0)?_kt('Immutable'):'';

        $this->workflow = $result['workflow'];
        $this->workflowState = $result['workflowstate'];

        $this->oemDocumentNo = $result['oem_no'];
        if (empty($this->oemDocumentNo)) $this->oemDocumentNo = 'n/a';

        // No folder row was joined: the document's folder could not be found.
        if (is_null($result['name']))
        {
            $this->fullpath = '(orphaned)';
        }
        else
        {
            $this->fullpath = $result['full_path'];
        }

        $this->mimeType = $result['mimetype'];
        $this->mimeIconPath = $result['mime_icon_path'];
        if (empty($this->mimeIconPath))
        {
            $this->mimeIconPath = 'unspecified_type';
        }
        $this->mimeDisplay = $result['mime_display'];

        $this->storagePath = $result['storage_path'];
        $this->status = Document::getStatusString($result['status_id']);
    }

    public function getDocumentID() { return $this->getId(); }
    public function getIsLive() { return (bool) $this->live; }
    public function getFilesize() { return $this->filesize; }
    public function getVersion() { return (string) $this->version; }
    public function getFilename() { return (string)$this->filename; }
    public function getFolderId() { return (int)$this->folderId; }
    public function getOemDocumentNo() { return (string) $this->oemDocumentNo; }
    public function getDocument() { return Document::get($this->id); }
    public function getIsAvailable() { return $this->getDocument()->isLive(); }
    public function getCheckedOutUser() { return (string) $this->checkedOutUser; }
    public function getCheckedOutBy() { return $this->getCheckedOutUser(); }
    // Misspelt name kept so existing callers keep working.
    public function getCheckedOutByr() { return $this->getCheckedOutUser(); }
    public function getWorkflowOnly() { return (string)$this->workflow; }
    // Delegates to getWorkflowOnly(); calling getWorkflow() here recursed forever.
    public function getWorkflow() { return $this->getWorkflowOnly(); }
    public function getWorkflowStateOnly() { return (string)$this->workflowState; }
    public function getWorkflowState() { return $this->getWorkflowStateOnly(); }

    /**
     * @return string "<workflow> - <state>", or '' when the document has no workflow
     */
    public function getWorkflowAndState()
    {
        if (is_null($this->workflow))
        {
            return '';
        }
        return "$this->workflow - $this->workflowState";
    }

    public function getMimeType() { return (string) $this->mimeType; }
    public function getMimeIconPath() { return (string) $this->mimeIconPath; }
    public function getMimeDisplay() { return (string) $this->mimeDisplay; }
    public function getDateCheckedOut() { return (string) $this->dateCheckedout; }
    public function getModifiedBy() { return (string) $this->modifiedBy; }
    public function getDateModified() { return (string) $this->dateModified; }
    public function getCreatedBy() { return (string) $this->createdBy; }
    public function getDateCreated() { return (string) $this->dateCreated; }
    public function getOwner() { return (string) $this->owner; }
    public function getOwnedBy() { return $this->getOwner(); }
    public function getIsImmutable() { return (bool) $this->immutable; }
    public function getImmutable() { return $this->getIsImmutable(); }
    public function getStatus() { return $this->status; }
    public function getStoragePath() { return $this->storagePath; }
    public function getDocumentType() { return $this->documentType; }
    public function getPermissions() { return KTAPI_Document::get_permission_string($this->getDocument()); }

    /**
     * @return bool True when the item is live and the user has read
     *              permission on the document or admin mode is active
     */
    public function getCanBeReadByUser()
    {
        if (!$this->live)
        {
            return false;
        }
        if (Permission::userHasDocumentReadPermission($this->getDocument()))
        {
            return true;
        }
        if (Permission::adminIsInAdminMode())
        {
            return true;
        }
        return false;
    }
}
/**
 * Search result item describing a folder.
 *
 * Title, full path, parent id and creator name are loaded from the Folder
 * object when the item is constructed.
 */
class FolderResultItem extends QueryResultItem
{
    protected $folder;
    protected $createdBy;
    protected $parentId;

    /**
     * @param int $folder_id
     * @param mixed $rank
     * @param string $title
     * @param string $text
     * @param string $fullpath
     * @throws Exception when the folder cannot be loaded
     */
    public function __construct($folder_id, $rank=null, $title=null, $text=null, $fullpath = null)
    {
        parent::__construct($folder_id, $title, $rank, $text, $fullpath);
        $this->loadFolderInfo();
    }

    public function getFolderID() { return $this->getId(); }
    public function getParentID() { return $this->parentId; }
    public function getCreatedBy() { return $this->createdBy; }
    public function getMimeIconPath() { return 'folder'; }
    public function getFolder() { return Folder::get($this->getFolderID()); }
    public function getPermissions() { return KTAPI_Folder::get_permission_string($this->getFolder()); }

    /**
     * Populate title, full path, parent id and creator name from the folder.
     *
     * @throws Exception when the folder cannot be loaded
     */
    public function loadFolderInfo()
    {
        $folder = $this->getFolder();
        if (PEAR::isError($folder))
        {
            // Report the error carried by $folder itself.
            throw new Exception('Database exception! There appears to be an error in the system: ' .$folder->getMessage());
        }

        $this->title = $folder->getName();
        $this->fullpath = '/' . $folder->getFullPath();
        $this->parentId = $folder->getParentId();

        $user = User::get($folder->getCreatorID());
        $this->createdBy = (PEAR::isError($user))?_kt('Unknown'):$user->getName();
    }
}
/**
 * Result item for a document shortcut; wraps the result item of the
 * document it refers to.
 */
class DocumentShortcutResultItem extends ProxyResultItem
{
    /**
     * @return mixed The id of this item, as reported by getId()
     */
    public function getDocumentID()
    {
        return $this->getId();
    }

    /**
     * @return string The wrapped item's icon path with '_shortcut' appended
     */
    public function getMimeIconPath()
    {
        $targetIcon = $this->proxy->getMimeIconPath();
        return $targetIcon . '_shortcut';
    }
}
/**
 * Result item for a folder shortcut; wraps the result item of the folder
 * it refers to.
 */
class FolderShortcutResultItem extends ProxyResultItem
{
    public $parentId;
    public $linkedId;
    public $full_path;

    /**
     * @return mixed The id of this item, as reported by getId()
     */
    public function getFolderID()
    {
        return $this->getId();
    }

    /**
     * @return string Always 'folder_shortcut'
     */
    public function getMimeIconPath()
    {
        return 'folder_shortcut';
    }
}
/**
 * Comparison callback that orders two items by their Rank property,
 * lowest first.
 *
 * @param object $a
 * @param object $b
 * @return int 0 when the ranks are equal, -1 when $a ranks lower, 1 otherwise
 */
function MatchResultCompare($a, $b)
{
    $left = $a->Rank;
    $right = $b->Rank;

    if ($left != $right)
    {
        if ($left < $right)
        {
            return -1;
        }
        return 1;
    }

    return 0;
}
abstract class Indexer
{
/**
* Cache of extractors
*
* @var array
*/
private $extractorCache;
/**
* Indicates if the indexer will do logging.
*
* @var boolean
*/
private $debug;
/**
* Cache of mime type specific hooks, keyed by mime type
*
* @var array
*/
private $mimeHookCache;
/**
* Cache on general hooks.
*
* @var array
*/
private $generalHookCache;
/**
* This is a path to the extractors.
*
* @var string
*/
private $extractorPath;
/**
* This is a path to the hooks.
*
* @var string
*/
private $hookPath;
private $enabledExtractors;
protected $inclStatus = true;
/**
 * Initialise the indexer: read the indexer settings from the configuration,
 * reset the extractor and hook caches and load the enabled extractors.
 */
protected function __construct()
{
    $config = KTConfig::getSingleton();

    $this->debug = $config->get('indexer/debug', true);
    $this->extractorPath = $config->get('indexer/extractorPath', 'extractors');
    $this->hookPath = $config->get('indexer/extractorHookPath','extractorHooks');

    $this->extractorCache = array();
    // The declared property is $mimeHookCache; initialise it so it is an
    // array before it is first read.
    $this->mimeHookCache = array();
    $this->generalHookCache = array();

    $this->loadExtractorStatus();
}
/**
 * Load the names of the enabled extractors (mime_extractors rows with
 * active=1) into $this->enabledExtractors.
 *
 * If the query fails the list is left empty.
 */
private function loadExtractorStatus()
{
    $this->enabledExtractors = array();

    $rows = DBUtil::getResultArray("SELECT id, name FROM mime_extractors WHERE active=1");
    if (PEAR::isError($rows) || !is_array($rows))
    {
        // Nothing usable came back; do not iterate over an error object.
        return;
    }

    foreach ($rows as $row)
    {
        $this->enabledExtractors[] = $row['name'];
    }
}
/**
 * Check whether an extractor is in the list of enabled extractors.
 *
 * @param string $extractor Extractor name
 * @return boolean
 */
private function isExtractorEnabled($extractor)
{
    $enabled = $this->enabledExtractors;
    return in_array($extractor, $enabled);
}
/**
 * Choose whether only live documents (status of 1) are used, or whether
 * deleted and archived documents are returned as well.
 *
 * @param bool $incl True to restrict to live documents
 */
public function setIncludeStatus($incl)
{
    $this->inclStatus = $incl;
}
/**
 * Returns the shared indexer instance, creating it on first use.
 *
 * The class name is read from the 'indexer/coreClass' configuration
 * setting and loaded from indexing/indexers/<class>.inc.php.
 *
 * @return Indexer
 * @throws Exception when the configured class does not exist
 */
public static function get()
{
    static $singleton = null;

    if (!is_null($singleton))
    {
        return $singleton;
    }

    $config = KTConfig::getSingleton();
    $classname = $config->get('indexer/coreClass');

    require_once('indexing/indexers/' . $classname . '.inc.php');

    if (!class_exists($classname))
    {
        throw new Exception("Class '$classname' does not exist.");
    }

    $singleton = new $classname;

    return $singleton;
}
/**
 * Implemented by the concrete indexer.
 * NOTE(review): presumably removes the given document from the index - confirm against the implementations.
 *
 * @param int $docid
 */
abstract public function deleteDocument($docid);
/**
 * Remove the association of all extractors to mime types on the database:
 * mime_types.extractor_id is set to null and every row of mime_extractors
 * is deleted.
 */
public function clearExtractors()
{
    global $default;

    DBUtil::runQuery("update mime_types set extractor_id=null");
    DBUtil::runQuery("delete from mime_extractors");

    if ($this->debug)
    {
        $default->log->debug('clearExtractors');
    }
}
/**
 * Lookup the name of the extractor class based on the file type.
 *
 * @param string $type Value matched against mime_types.filetypes
 * @return string|PEAR_Error The extractor name, or the database error
 */
public static function resolveExtractor($type)
{
    global $default;

    // $type is embedded in a quoted SQL literal, so escape it first.
    $safeType = sanitizeForSQL($type);
    $sql = "select extractor from mime_types where filetypes='$safeType'";
    $class = DBUtil::getOneResultKey($sql,'extractor');
    if (PEAR::isError($class))
    {
        $default->log->error("resolveExtractor: cannot resolve $type");
        return $class;
    }

    // This method is static, so there is no $this->debug to consult; read
    // the same configuration setting the constructor uses.
    $config = KTConfig::getSingleton();
    if ($config->get('indexer/debug', true))
    {
        $default->log->debug(sprintf(_kt("resolveExtractor: Resolved '%s' from mime type '%s'."), $class, $type));
    }

    return $class;
}
/**
 * Return all the discussion text for a document: the subject and body of
 * each comment, each followed by a newline.
 *
 * @param int $docid
 * @return string Empty when there are no comments or the query fails
 */
public static function getDiscussionText($docid)
{
    // The id is embedded in the query text, so force it to an integer.
    $docid = (int) $docid;

    $sql = "SELECT
            dc.subject, dc.body
        FROM
            discussion_threads dt
            INNER JOIN discussion_comments dc ON dc.thread_id=dt.id AND dc.id BETWEEN dt.first_comment_id AND dt.last_comment_id
        WHERE
            dt.document_id=$docid";
    $result = DBUtil::getResultArray($sql);
    if (PEAR::isError($result) || !is_array($result))
    {
        return '';
    }

    $text = '';
    foreach ($result as $record)
    {
        $text .= $record['subject'] . "\n" . $record['body'] . "\n";
    }

    return $text;
}
/**
 * Schedule the indexing of a document.
 *
 * The document is (re)inserted into both the index_files queue and the
 * process_queue table.
 *
 * @param mixed $document A Document object or a numeric document id
 * @param string $what Value stored in index_files.what
 */
public static function index($document, $what='A')
{
    global $default;

    if (is_numeric($document))
    {
        $document = Document::get($document+0);
    }

    if (PEAR::isError($document))
    {
        $default->log->error("index: Could not index document: " .$document->getMessage());
        return;
    }

    $document_id = $document->getId();

    // There may be no session user; fall back to user id 1 in that case.
    $userid = empty($_SESSION['userID']) ? 1 : $_SESSION['userID'];

    // we dequeue the document so that there are no issues when enqueuing
    Indexer::unqueueDocument($document_id);

    // enqueue item
    $sql = "INSERT INTO index_files(document_id, user_id, what) VALUES($document_id, $userid, '$what')";
    DBUtil::runQuery($sql);

    $default->log->debug("index: Queuing indexing of $document_id");

    // Appending the process queue to the index for convenience
    // Don't want to complicate matters by creating too many new classes and files
    Indexer::unqueueDocFromProcessing($document_id);

    // enqueue item
    $date = date('Y-m-d H:i:s');
    $sql = "INSERT INTO process_queue(document_id, date_added) VALUES($document_id, '$date')";
    DBUtil::runQuery($sql);

    $default->log->debug("Processing queue: Queuing document for processing - $document_id");
}
/**
 * Add one to the indexed document count held in the system settings.
 */
private static function incrementCount()
{
    $next = (int) Indexer::getIndexedDocumentCount() + 1;
    Indexer::updateIndexedDocumentCount($next);
}
/**
 * Read the 'indexedDocumentCount' system setting.
 *
 * @return int The stored count, 0 when the setting is not set
 */
public static function getIndexedDocumentCount()
{
    return (int) KTUtil::getSystemSetting('indexedDocumentCount', 0);
}
/**
 * Store the 'indexedDocumentCount' system setting.
 *
 * @param int $cnt The new count
 */
public static function updateIndexedDocumentCount($cnt = 0)
{
    $settingName = 'indexedDocumentCount';
    KTUtil::setSystemSetting($settingName, $cnt);
}
/**
 * Clear the processdate of every entry in the index_files queue.
 */
public static function reindexQueue()
{
    DBUtil::runQuery("UPDATE index_files SET processdate = null");
}
/**
 * Clear the processdate and status message of one document's entry in the
 * index_files queue.
 *
 * @param int $documentId
 */
public static function reindexDocument($documentId)
{
    // The id is embedded in the query text, so force it to an integer.
    $documentId = (int) $documentId;

    $sql = "UPDATE index_files SET processdate=null, status_msg=null WHERE document_id=$documentId";
    DBUtil::runQuery($sql);
}
/**
 * Empty the index_files queue and queue every document with status_id=1
 * for indexing.
 */
public static function indexAll()
{
    // There may be no session user; fall back to user id 1 in that case.
    $userid = empty($_SESSION['userID']) ? 1 : $_SESSION['userID'];

    $sql = "DELETE FROM index_files";
    DBUtil::runQuery($sql);

    $sql = "INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE status_id=1 and id not in (select document_id from index_files)";
    DBUtil::runQuery($sql);
}
/**
 * Empty the process_queue table and add every document with status_id=1
 * to it.
 */
public static function processAll()
{
    // Empty the queue
    DBUtil::runQuery("DELETE FROM process_queue");

    // Add all documents to the queue
    DBUtil::runQuery("INSERT INTO process_queue(document_id, date_added) SELECT id, now() FROM documents WHERE status_id=1 and id not in (select document_id from process_queue)");
}
/**
 * Queue for indexing every document with status_id=1 whose full_path lies
 * below the given folder and that is not already in index_files.
 *
 * @param Folder|FolderProxy $folder
 * @throws Exception when $folder is not a Folder or FolderProxy
 */
public static function indexFolder($folder)
{
    // There may be no session user; fall back to user id 1 in that case.
    $userid = empty($_SESSION['userID']) ? 1 : $_SESSION['userID'];

    if (!$folder instanceof Folder && !$folder instanceof FolderProxy)
    {
        throw new Exception('Folder expected');
    }

    // The path is embedded in a quoted SQL literal (folder names may contain
    // quotes), so escape it first.
    $full_path = sanitizeForSQL($folder->getFullPath());

    $sql = "INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE full_path like '{$full_path}/%' AND status_id=1 and id not in (select document_id from index_files)";
    DBUtil::runQuery($sql);
}
/**
 * Clear out the scheduling of documents that no longer exist, and release
 * queue entries that may be stuck.
 *
 * Entries are removed from index_files when the document has status_id=3
 * or no longer has a row in documents.
 */
public static function clearoutDeleted()
{
    global $default;

    $sql = 'DELETE FROM
            index_files
        WHERE
            document_id in (SELECT d.id FROM documents AS d WHERE d.status_id=3) OR
            NOT EXISTS(SELECT index_files.document_id FROM documents WHERE index_files.document_id=documents.id)';
    DBUtil::runQuery($sql);

    $default->log->debug("Indexer::clearoutDeleted: removed documents from indexing queue that have been deleted");

    // Multiple indexing processes cannot occur at the same time - the lock file prevents this.
    // However if the indexing is interrupted the documents can get stuck in the queue with the processdate set
    // but never having been indexed. To prevent this we will clear the processdate on all documents without errors.
    $sql = 'UPDATE index_files SET processdate = null where processdate is not null and status_msg is null';
    $res = DBUtil::runQuery($sql);

    if (PEAR::isError($res))
    {
        // getMessage is a method; it has to be called to get the error text.
        $default->log->error("Indexer::clearoutDeleted: something happened ".$res->getMessage());
    }

    $default->log->debug("Indexer::clearoutDeleted: resetting processdate for documents that may be stuck");
}
/**
 * Clear out the processing of documents that no longer exist.
 *
 * Entries are removed from process_queue when the document has status_id=3
 * or no longer has a row in documents.
 */
public static function clearoutDeletedFromProcessor()
{
    global $default;

    $cleanupSql = 'DELETE FROM
process_queue
WHERE
document_id in (SELECT d.id FROM documents AS d WHERE d.status_id=3) OR
NOT EXISTS(SELECT process_queue.document_id FROM documents WHERE process_queue.document_id=documents.id)';

    DBUtil::runQuery($cleanupSql);

    $default->log->debug("Process queue: removed documents from processing queue that have been deleted");
}
/**
 * Check if a document is scheduled to be indexed
 *
 * @param mixed $document This may be a document or document id
 * @return boolean False when the argument is neither, when the lookup
 *                 fails, or when the document is not in index_files
 */
public static function isDocumentScheduled($document)
{
    if (is_numeric($document))
    {
        $docid = $document;
    }
    else if ($document instanceof Document)
    {
        $docid = $document->getId();
    }
    else
    {
        return false;
    }

    $sql = "SELECT 1 FROM index_files WHERE document_id=$docid";
    $result = DBUtil::getResultArray($sql);

    // A failed query must not be counted as a scheduled document.
    if (PEAR::isError($result) || !is_array($result))
    {
        return false;
    }

    return count($result) > 0;
}
/**
 * Filters text removing redundant characters such as continuous newlines and spaces.
 *
 * The file is rewritten in place with every run of carriage returns,
 * newlines, tabs and spaces collapsed into a single space.
 *
 * @param string $filename Path of the text file to filter
 * @return boolean True when there was nothing to filter or the file was
 *                 rewritten; false when writing the file failed
 */
private function filterText($filename)
{
    $content = file_get_contents($filename);

    // if the file is empty skip the filter - document was probably empty
    if (empty($content))
    {
        global $default;
        $default->log->debug('No text was extracted from the document. Either it was empty or there was a problem with the extraction');
        return true;
    }

    // shrink what is being stored.
    // A single pass yields the result the former replace-until-stable loop
    // converged to. That loop stopped on a loose != comparison, which could
    // end it early when the content was a number padded with whitespace.
    $content = preg_replace('/[\r\n\t ]+/', ' ', $content);

    return file_put_contents($filename, $content) !== false;
}
/**
 * Load hooks for text extraction process.
 *
 * Every file in the hook directory whose name ends in 'Hook.inc.php' is
 * included; the class named after the file (minus '.inc.php') is
 * instantiated and, when it is an ExtractorHook, cached either as a general
 * hook (registerMimeTypes() returned null) or under each mime type it
 * registers.
 */
private function loadExtractorHooks()
{
    $this->generalHookCache = array();
    $this->mimeHookCache = array();

    $dir = opendir(SearchHelper::correctPath($this->hookPath));
    if ($dir === false)
    {
        // The hook directory could not be opened; there is nothing to load.
        return;
    }

    while (($file = readdir($dir)) !== false)
    {
        if (substr($file,-12) != 'Hook.inc.php')
        {
            continue;
        }

        require_once($this->hookPath . '/' . $file);
        $class = substr($file, 0, -8);

        if (!class_exists($class))
        {
            continue;
        }

        $hook = new $class;

        // Test the instance: $class is only the class name string and is
        // never an instance of anything.
        if (!($hook instanceof ExtractorHook))
        {
            continue;
        }

        // Hooks are stored by value. Storing a reference to $hook would let
        // the next loop iteration overwrite entries cached earlier.
        $mimeTypes = $hook->registerMimeTypes();
        if (is_null($mimeTypes))
        {
            $this->generalHookCache[] = $hook;
        }
        else
        {
            foreach ($mimeTypes as $type)
            {
                $this->mimeHookCache[$type][] = $hook;
            }
        }
    }

    closedir($dir);
}
/**
 * Run one phase of the cached hooks against an extractor.
 *
 * @param DocumentExtractor $extractor Passed to each hook
 * @param string $phase Name of the hook method to call
 * @param string $mimeType Optional. If set, indicates which hooks must be used, else assume general.
 */
private function executeHook($extractor, $phase, $mimeType = null)
{
    if (is_null($mimeType))
    {
        $selected = $this->generalHookCache;
    }
    elseif (array_key_exists($mimeType, $this->mimeHookCache))
    {
        $selected = $this->mimeHookCache[$mimeType];
    }
    else
    {
        $selected = array();
    }

    if (empty($selected))
    {
        return;
    }

    foreach ($selected as $current)
    {
        $current->$phase($extractor);
    }
}
/**
 * Run the indexer diagnostics and, unless $simple is set, the extractor
 * diagnostics as well.
 *
 * A lock file in the cache directory records that problems were already
 * logged, so the same problems are not logged again on every run. The file
 * is removed once a full check passes.
 *
 * @param boolean $simple When true, the extractor diagnostics are skipped
 * @return boolean True when the diagnostics that were run reported no problem
 */
private function doesDiagnosticsPass($simple=false)
{
    global $default;

    $config = KTConfig::getSingleton();
    $lockFile = $config->get('cache/cacheDirectory') . '/index.log.lock';

    $indexerProblem = $this->diagnose();
    if (!is_null($indexerProblem))
    {
        // only log while no lock file exists yet
        if (!is_file($lockFile))
        {
            $default->log->error(_kt('Indexer problem: ') . $indexerProblem);
        }
        touch($lockFile);
        return false;
    }

    if ($simple)
    {
        return true;
    }

    $extractorProblems = $this->diagnoseExtractors();
    if (!empty($extractorProblems))
    {
        if (!is_file($lockFile))
        {
            foreach ($extractorProblems as $problem)
            {
                $default->log->error(sprintf(_kt('%s problem: %s'), $problem['name'],$problem['diagnosis']));
            }
        }
        touch($lockFile);
        return false;
    }

    // everything passed: clear the marker left by an earlier failure
    if (is_file($lockFile))
    {
        $default->log->info(_kt('Issues with the indexer have been resolved!'));
        unlink($lockFile);
    }

    return true;
}
/**
 * This does the initial mime type association between mime types and text extractors
 *
 * The work is done only once; the 'mimeTypesRegistered' system setting
 * records that it has happened.
 */
public function checkForRegisteredTypes()
{
    global $default;

    // we are only doing this once!
    $initRegistered = KTUtil::getSystemSetting('mimeTypesRegistered', false);
    if ($initRegistered)
    {
        return;
    }

    if ($this->debug) $default->log->debug('checkForRegisteredTypes: start');

    $date = date('Y-m-d H:i');
    $sql = "UPDATE scheduler_tasks SET run_time='$date'";
    DBUtil::runQuery($sql);

    $this->registerTypes(true);

    // extractors that are switched off by default, per operating system
    $disable = array(
        'windows'=>array('PSExtractor'),
        'unix' => array()
    );

    $disabledNames = OS_WINDOWS?$disable['windows']:$disable['unix'];
    if (!empty($disabledNames))
    {
        $nameList = '\'' . implode("','", $disabledNames) .'\'';

        $sql = "UPDATE mime_extractors SET active=0 WHERE name in ($nameList)";
        DBUtil::runQuery($sql);

        // Log each extractor by name.
        foreach ($disabledNames as $extractor)
        {
            $default->log->info("checkForRegisteredTypes: disabled '$extractor'");
        }
    }

    $this->loadExtractorStatus();

    if ($this->debug) $default->log->debug('checkForRegisteredTypes: done');
    KTUtil::setSystemSetting('mimeTypesRegistered', true);
}
/**
 * Append a line to the indexing history and store the whole history as the
 * status message of the document's index_files entry.
 *
 * @param int $documentId
 * @param string $message
 * @param string $level Prefixed to the message in the history
 */
private function updatePendingDocumentStatus($documentId, $message, $level)
{
    $this->indexingHistory = $this->indexingHistory . "\n" . $level . ': ' . $message;

    $escapedHistory = sanitizeForSQL($this->indexingHistory);

    DBUtil::runQuery("UPDATE index_files SET status_msg='$escapedHistory' WHERE document_id=$documentId");
}
private $restartCurrentBatch = false;
/**
 * Set the flag that marks the current batch for a restart.
 */
public function restartBatch()
{
    $restart = true;
    $this->restartCurrentBatch = $restart;
}
/**
 * Record a message against a queued document and write it to the log.
 *
 * Debug messages are only written to the log when indexer debugging is on;
 * every other level is always logged.
 *
 * @param int $documentId
 * @param string $message
 * @param string $level This may be info, error, debug
 */
private function logPendingDocumentInfoStatus($documentId, $message, $level)
{
    $this->updatePendingDocumentStatus($documentId, $message, $level);

    global $default;

    if ($level == 'debug')
    {
        if ($this->debug)
        {
            $default->log->debug($message);
        }
    }
    else
    {
        // the level doubles as the name of the logger method
        $default->log->$level($message);
    }
}
/**
 * Load and instantiate an extractor class from the extractors directory.
 *
 * @param string $extractorClass Class name; the class is expected in
 *                               extractors/<class>.inc.php
 * @return DocumentExtractor|null Null when no class name is given
 * @throws Exception when the file or class is missing, or the class is not
 *                   a DocumentExtractor
 */
public function getExtractor($extractorClass)
{
    if (empty($extractorClass))
    {
        return null;
    }

    $includeFile = SEARCH2_INDEXER_DIR . 'extractors/' . $extractorClass . '.inc.php';
    if (!file_exists($includeFile))
    {
        throw new Exception("Extractor file does not exist: $includeFile");
    }

    require_once($includeFile);

    if (!class_exists($extractorClass))
    {
        throw new Exception("Extractor '$extractorClass' not defined in file: $includeFile");
    }

    $extractor = new $extractorClass();

    if (!($extractor instanceof DocumentExtractor))
    {
        throw new Exception("Class $extractorClass was expected to be of type DocumentExtractor");
    }

    return $extractor;
}
/**
 * List the entries of the indexing queue for documents with status_id=1.
 *
 * @param boolean $problemItemsOnly True to list entries that carry a status
 *                                  message, false to list entries without one
 * @return mixed The result of DBUtil::getResultArray()
 */
public static function getIndexingQueue($problemItemsOnly=true)
{
    // the two listings differ only in this condition
    $statusCondition = $problemItemsOnly
        ? "(iff.status_msg IS NOT NULL AND iff.status_msg <> '')"
        : "(iff.status_msg IS NULL or iff.status_msg = '')";

    $sql = "SELECT\n"
        . "iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename\n"
        . "FROM\n"
        . "index_files iff\n"
        . "INNER JOIN documents d ON iff.document_id=d.id\n"
        . "INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id\n"
        . "INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id\n"
        . "INNER JOIN mime_types mt ON dcv.mime_id=mt.id\n"
        . "LEFT JOIN mime_extractors me ON mt.extractor_id=me.id\n"
        . "WHERE\n"
        . $statusCondition . " AND d.status_id=1\n"
        . "ORDER BY indexdate ";

    return DBUtil::getResultArray($sql);
}
/**
 * List the indexing queue entries that carry no status message.
 *
 * @return mixed The result of Indexer::getIndexingQueue(false)
 */
public static function getPendingIndexingQueue()
{
    $problemItemsOnly = false;
    return Indexer::getIndexingQueue($problemItemsOnly);
}
/**
 * Collect statistics about the index and the indexing queue.
 *
 * @return array Keys: optimisationDate, optimisationPeriod, indexingDate,
 *               indexingPeriod, docsInIndex, docsInQueue, errorsInQueue,
 *               docsInRepository, indexingCoverage, queueCoverage,
 *               noOptimisation
 */
public function getIndexStatistics()
{
    $optimisationDate = KTUtil::getSystemSetting('luceneOptimisationDate', '');

    $noOptimisation = false;
    if ($optimisationDate == '')
    {
        $optimisationDate = _kt('N/A');
        $optimisationPeriod = $optimisationDate;
    }
    else
    {
        $optimisationPeriod = KTUtil::computePeriodToDate($optimisationDate, null, true);
        // flag when the last optimisation is more than two days old
        $noOptimisation = $optimisationPeriod['days'] > 2;
        $optimisationPeriod = $optimisationPeriod['str'];
        $optimisationDate = date('Y-m-d H:i:s', $optimisationDate);
    }

    $indexingDate = KTUtil::getSystemSetting('luceneIndexingDate', '');
    if ($indexingDate == '')
    {
        $indexingDate = _kt('N/A');
        $indexingPeriod = $indexingDate;
    }
    else
    {
        $indexingPeriod = KTUtil::computePeriodToDate($indexingDate);
        $indexingDate = date('Y-m-d H:i:s', $indexingDate);
    }

    $index = Indexer::get();
    $docsInIndex = $index->getDocumentsInIndex();

    // we are only interested in documents that are active
    $sql = "SELECT count(*) as docsInQueue FROM index_files i inner join documents d on i.document_id = d.id where (i.status_msg is null or i.status_msg = '') and d.status_id=1";
    $docsInQueue = DBUtil::getOneResultKey($sql, 'docsInQueue');

    // An entry is an error only when it has a non-empty status message. This
    // is the complement of the docsInQueue filter and matches the filter in
    // getIndexingQueue(); an OR here counted empty-message entries twice.
    $sql = "SELECT count(*) as errorsInQueue FROM index_files i inner join documents d on i.document_id = d.id where (i.status_msg is not null and i.status_msg <> '') and d.status_id=1";
    $errorsInQueue = DBUtil::getOneResultKey($sql, 'errorsInQueue');

    $sql = "SELECT count(*) as docsInRepository FROM documents where status_id=1";
    $docsInRepository = DBUtil::getOneResultKey($sql, 'docsInRepository');

    if ($docsInRepository == 0)
    {
        // avoid dividing by zero
        $indexingCoverage = '0.00%';
        $queueCoverage = $indexingCoverage;
    }
    else
    {
        // compute indexing coverage
        $indexingCoverage = _kt('Not Available');
        if (is_numeric($docsInIndex))
        {
            $indexingCoverage = ($docsInIndex * 100) / $docsInRepository;
            $indexingCoverage = number_format($indexingCoverage, 2, '.',',') . '%';
        }

        // compute queue coverage
        $queueCoverage = _kt('Not Available');
        if (is_numeric($docsInQueue))
        {
            $queueCoverage = ($docsInQueue * 100) / $docsInRepository;
            $queueCoverage = number_format($queueCoverage, 2, '.',',') . '%';
        }
    }

    $stats = array(
        'optimisationDate'=>$optimisationDate,
        'optimisationPeriod'=>$optimisationPeriod,
        'indexingDate'=>$indexingDate,
        'indexingPeriod'=>$indexingPeriod,
        'docsInIndex'=>$docsInIndex,
        'docsInQueue'=>$docsInQueue,
        'errorsInQueue'=>$errorsInQueue,
        'docsInRepository'=>$docsInRepository,
        'indexingCoverage'=>$indexingCoverage,
        'queueCoverage'=>$queueCoverage,
        'noOptimisation'=>$noOptimisation
    );

    return $stats;
}
/**
 * Store the serialized index statistics, indexer diagnosis and extractor
 * diagnosis in the system settings 'indexerStats', 'indexerDiagnostics'
 * and 'extractorDiagnostics'.
 */
public function updateIndexStats()
{
    KTUtil::setSystemSetting('indexerStats', serialize($this->getIndexStatistics()));

    KTUtil::setSystemSetting('indexerDiagnostics', serialize($this->diagnose()));

    KTUtil::setSystemSetting('extractorDiagnostics', serialize($this->diagnoseExtractors()));
}
/**
 * Perform diagnostics and pre-indexing setup
 * Refactored from indexDocuments()
 *
 * NOTE(review): false is returned when the diagnostics fail, but nothing is
 * returned on success - callers must compare against false explicitly.
 *
 * @return boolean|null False when the diagnostics fail
 */
public function preIndexingSetup()
{
    global $default;

    // Check mimetypes and load the text extractors
    $this->checkForRegisteredTypes();

    // Check diagnostics on extractors
    $diagnosticsOk = $this->doesDiagnosticsPass();
    if (!$diagnosticsOk)
    {
        //unlink($indexLockFile);
        if ($this->debug)
        {
            $default->log->debug('indexDocuments: stopping - diagnostics problem. The administration section will provide more information.');
        }
        return false;
    }

    // Load extractor hooks
    $this->loadExtractorHooks();

    $this->storageManager = KTStorageManagerUtil::getSingleton();

    // Config setting - urls/tmpDirectory
    $this->tempPath = $default->tmpDirectory;
}
/**
 * Get the queue of documents for indexing and mark the returned documents
 * as being processed.
 * Refactored from indexDocuments()
 *
 * @param int $max Maximum number of documents to return; 20 when empty
 * @return array|null Rows with document_id, filetypes, mimetypes, extractor
 *                    and what; null on a database error or when there is
 *                    no work to do
 */
public function getDocumentsQueue($max = null)
{
    global $default;

    // An empty limit would leave the LIMIT clause without a value; fall back
    // to the default used by getDocumentProcessingQueue().
    $max = (int) $max;
    if ($max <= 0)
    {
        $max = 20;
    }

    // Cleanup the queue
    Indexer::clearoutDeleted();

    $date = date('Y-m-d H:i:s');

    // identify the indexers that must run
    // mysql specific limit!
    $sql = "SELECT
            iff.document_id, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what
        FROM
            index_files iff
            INNER JOIN documents d ON iff.document_id=d.id
            INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
            INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
            INNER JOIN mime_types mt ON dcv.mime_id=mt.id
            LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
        WHERE
            (iff.processdate IS NULL or iff.processdate < date_sub('$date', interval 1 day)) AND dmv.status_id=1
        ORDER BY indexdate
        LIMIT $max";
    $result = DBUtil::getResultArray($sql);
    if (PEAR::isError($result))
    {
        //unlink($indexLockFile);
        if ($this->debug) $default->log->error('indexDocuments: stopping - db error');
        return;
    }

    KTUtil::setSystemSetting('luceneIndexingDate', time());

    // bail if no work to do
    if (count($result) == 0)
    {
        //unlink($indexLockFile);
        if ($this->debug) $default->log->debug('indexDocuments: stopping - no work to be done');
        return;
    }

    // identify any documents that need indexing and mark them
    // so they are not taken in a followup run
    $ids = array();
    foreach ($result as $docinfo)
    {
        $ids[] = $docinfo['document_id'];
    }

    // mark the documents as being processed
    $ids = implode(',', $ids);
    $sql = "UPDATE index_files SET processdate='$date' WHERE document_id in ($ids)";
    DBUtil::runQuery($sql);

    return $result;
}
/**
 * Fetch the next batch of documents waiting in the processing queue.
 *
 * Only entries that have never been processed, or were last processed more
 * than a day ago, are selected.
 *
 * @param int $max Maximum number of rows to fetch. Defaults to 20 when empty.
 * @return array|null Rows with the keys 'document_id', 'filetypes' and
 *                    'mimetypes'; null on a database error or when the queue
 *                    holds no work.
 */
public function getDocumentProcessingQueue($max = null)
{
    global $default;

    if (empty($max))
    {
        $max = 20;
    }

    // Drop stale entries before reading the queue
    Indexer::clearoutDeletedFromProcessor();

    $date = date('Y-m-d H:i:s');

    // note: LIMIT is MySQL specific
    $sql = "SELECT
pq.document_id, mt.filetypes, mt.mimetypes
FROM
process_queue pq
INNER JOIN documents d ON pq.document_id=d.id
INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
INNER JOIN mime_types mt ON dcv.mime_id=mt.id
WHERE
(pq.date_processed IS NULL or pq.date_processed < date_sub('$date', interval 1 day)) AND dmv.status_id=1
ORDER BY date_added
LIMIT $max";
    $rows = DBUtil::getResultArray($sql);

    if (PEAR::isError($rows))
    {
        $default->log->error('Processing queue: stopping - db error: '.$rows->getMessage());
        return;
    }

    // nothing queued - nothing to hand back
    if (count($rows) == 0)
    {
        $default->log->debug('Processing queue: stopping - no work to be done');
        return;
    }

    return $rows;
}
/**
 * Process a document - extract text and index it.
 * Refactored from indexDocuments().
 *
 * Depending on $docinfo['what'] the document content ('A' or 'C') and/or the
 * discussion ('A' or 'D') is indexed. When indexing succeeds, or the document
 * does not need indexing, it is removed from the indexing queue; otherwise it
 * is left in the queue.
 *
 * Relies on $this->storageManager and $this->tempPath, which are set by
 * preIndexingSetup().
 *
 * @param object $document The document to index. getFileSize(), getFileName(),
 *                         getName(), getMajorVersionNumber() and
 *                         getMinorVersionNumber() are called on it.
 * @param array $docinfo Queue row with the keys 'document_id', 'filetypes',
 *                       'mimetypes', 'extractor' and 'what'.
 * @return void
 */
public function processDocument($document, $docinfo)
{
    global $default;
    static $extractorCache = array();

    // increment indexed documents count
    Indexer::incrementCount();

    // if document is a zero byte file, let's just unqueue and return
    if ($document->getFileSize() == 0)
    {
        Indexer::unqueueDocument($docinfo['document_id'],
            sprintf(_kt("Zero Byte documents do not need to be indexed: %d"),
                $docinfo['document_id']));
        return;
    }

    $docId = $docinfo['document_id'];
    $extension = $docinfo['filetypes'];
    $mimeType = $docinfo['mimetypes'];
    $extractorClass = $docinfo['extractor'];
    $indexDocument = in_array($docinfo['what'], array('A','C'));
    $indexDiscussion = in_array($docinfo['what'], array('A','D'));
    $this->indexingHistory = '';
    $tempPath = $this->tempPath;

    $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension,$mimeType,$extractorClass), 'debug');

    if (empty($extractorClass))
    {
        /*
        if no extractor is found and we don't need to index discussions, then we can remove the item from the queue.
        */
        if ($indexDiscussion)
        {
            $indexDocument = false;
            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info');
        }
        else
        {
            Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"),$docId));
            return ;
        }
    }
    else
    {
        /*
        If an extractor is available, we must ensure it is enabled.
        */
        if (!$this->isExtractorEnabled($extractorClass))
        {
            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info');
            return ;
        }
    }

    if ($this->debug)
    {
        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"),$docId), 'info');
    }

    // When the batch is being restarted, put the document back on the queue for full ('A') indexing.
    if ($this->restartCurrentBatch)
    {
        Indexer::unqueueDocument($docId);
        Indexer::index($docId, 'A');
        return ;
    }

    $filename = $document->getFileName();
    if (substr($filename,0,1) == '~' || substr($filename,-1) == '~')
    {
        Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."),$docId), 'error');
        return ;
    }

    $removeFromQueue = true;
    if ($indexDocument)
    {
        // Extractor instances are cached per class for the lifetime of the request.
        if (array_key_exists($extractorClass, $extractorCache))
        {
            $extractor = $extractorCache[$extractorClass];
        }
        else
        {
            $extractor = $extractorCache[$extractorClass] = $this->getExtractor($extractorClass);
        }

        if (!($extractor instanceof DocumentExtractor))
        {
            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."),$extractorClass), 'error');
            return ;
        }

        $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
        $sourceFile = $this->storageManager->temporaryFile($document);

        if (empty($sourceFile) || !is_file($sourceFile))
        {
            Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."),$sourceFile,$docId), 'error');
            // This code no longer runs inside a loop, so stop processing this document by returning.
            return;
        }

        if ($extractor->needsIntermediateSourceFile())
        {
            $intermediate = $tempPath . '/'. $docId . '.' . $extension;
            $result = @copy($sourceFile, $intermediate);
            if ($result === false)
            {
                $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"),$docId), 'error');
                // problem. lets try again later. probably permission related. log the issue.
                // The document stays in the queue; return instead of the former loop 'continue'.
                return;
            }
            $sourceFile = $intermediate;
        }

        $extractor->setSourceFile($sourceFile);
        $extractor->setMimeType($mimeType);
        $extractor->setExtension($extension);
        $extractor->setDocument($document);
        $extractor->setIndexingStatus(null);
        $extractor->setExtractionStatus(null);
        $targetFile = tempnam($tempPath, 'ktindexer');
        $extractor->setTargetFile($targetFile);

        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"),$docId,$sourceFile,$targetFile), 'debug');

        $this->executeHook($extractor, 'pre_extract');
        $this->executeHook($extractor, 'pre_extract', $mimeType);

        // From here on the document is only unqueued when indexing reports success.
        $removeFromQueue = false;

        if ($extractor->extractTextContent())
        {
            // the extractor may need to create another target file
            $targetFile = $extractor->getTargetFile();
            $extractor->setExtractionStatus(true);

            $this->executeHook($extractor, 'pre_index');
            $this->executeHook($extractor, 'pre_index', $mimeType);

            $title = $document->getName();
            if ($indexDiscussion)
            {
                if (!$this->filterText($targetFile))
                {
                    $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
                }
                else
                {
                    $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
                    $removeFromQueue = $indexStatus;
                    if (!$indexStatus)
                    {
                        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"),$docId), 'error');
                    }
                    $extractor->setIndexingStatus($indexStatus);
                }
            }
            else
            {
                if (!$this->filterText($targetFile))
                {
                    $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
                }
                else
                {
                    $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version);
                    $removeFromQueue = $indexStatus;
                    if (!$indexStatus)
                    {
                        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocument"),$docId), 'error');
                        $this->logPendingDocumentInfoStatus($docId, '', 'error');
                    }
                    $extractor->setIndexingStatus($indexStatus);
                }
            }

            $this->executeHook($extractor, 'post_index', $mimeType);
            $this->executeHook($extractor, 'post_index');
        }
        else
        {
            $extractor->setExtractionStatus(false);
            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not extract contents from document %d"),$docId), 'error');
            $this->logPendingDocumentInfoStatus($docId, '', 'error');
        }

        $this->executeHook($extractor, 'post_extract', $mimeType);
        $this->executeHook($extractor, 'post_extract');

        // Clean up the temporary files created for this document.
        if ($extractor->needsIntermediateSourceFile())
        {
            @unlink($sourceFile);
        }
        @unlink($targetFile);
    }
    else
    {
        $indexStatus = $this->indexDiscussion($docId);
        $removeFromQueue = $indexStatus;
    }

    if ($removeFromQueue)
    {
        Indexer::unqueueDocument($docId, sprintf(_kt("Done indexing docid: %d"),$docId));
    }
    else
    {
        if ($this->debug) $default->log->debug(sprintf(_kt("Document docid: %d was not removed from the queue as it looks like there was a problem with the extraction process"),$docId));
    }
}
/**
 * The main function that may be called repeatedly to index documents.
 *
 * Does nothing when indexing is disabled ($default->enableIndexing) or when
 * the pre-indexing setup reports a diagnostics failure.
 *
 * @param int $max Maximum number of documents to index in this run.
 *                 Defaults to $default->batchDocuments when null.
 * @return void
 */
public function indexDocuments($max=null)
{
    global $default;

    if ($default->enableIndexing)
    {
        // Only an explicit false signals that the setup failed; carrying on
        // would use a storage manager and temp path that were never set.
        if ($this->preIndexingSetup() === false)
        {
            return;
        }

        if (is_null($max))
        {
            $max = $default->batchDocuments;
        }

        // getDocumentsQueue() returns null on a db error or when there is no work.
        $queue = $this->getDocumentsQueue($max);
        if (!is_array($queue))
        {
            $queue = array();
        }

        // Process queue
        foreach ($queue as $docinfo)
        {
            $docId = $docinfo['document_id'];

            // processDocument() needs the document object as well as the queue row.
            $document = Document::get($docId);
            if (PEAR::isError($document) || is_null($document))
            {
                $default->log->error(sprintf(_kt("indexDocuments: Cannot resolve document id %d."), $docId));
                continue;
            }

            // index document
            $this->processDocument($document, $docinfo);
        }
    }

    if ($this->debug) $default->log->debug('indexDocuments: done');
    return;
}
/**
 * Migrate extracted text stored in the document_text table into the index.
 *
 * Runs up to five passes, each taking a slice of $max rows. Every row is
 * written to a temporary file, indexed with indexDocumentAndDiscussion() and,
 * on success, deleted from document_text. Rows whose document can no longer
 * be resolved are deleted as well. A lock file in the cache directory
 * prevents concurrent runs. Once document_text is empty, the migration is
 * flagged as complete and the 'Index Migration' scheduler entry is removed.
 *
 * @param int $max Maximum number of documents to migrate in this run.
 *                 Defaults to the 'indexer/batchMigrateDocument' setting (500).
 * @return void
 */
public function migrateDocuments($max=null)
{
    global $default;

    $default->log->info(_kt('migrateDocuments: starting'));

    if (!$this->doesDiagnosticsPass(true))
    {
        $default->log->info(_kt('migrateDocuments: stopping - diagnostics problem. The dashboard will provide more information.'));
        return;
    }

    if (KTUtil::getSystemSetting('migrationComplete') == 'true')
    {
        $default->log->info(_kt('migrateDocuments: stopping - migration is complete.'));
        return;
    }

    $config =& KTConfig::getSingleton();
    if (is_null($max))
    {
        $max = $config->get('indexer/batchMigrateDocument',500);
    }

    // Directory for the temporary text files handed to the indexer.
    $tempPath = $config->get('urls/tmpDirectory');

    $lockFile = $config->get('cache/cacheDirectory') . '/migration.lock';
    if (is_file($lockFile))
    {
        $default->log->info(_kt('migrateDocuments: stopping - migration lockfile detected.'));
        return;
    }
    touch($lockFile);

    $startTime = KTUtil::getSystemSetting('migrationStarted');
    if (is_null($startTime))
    {
        KTUtil::setSystemSetting('migrationStarted', time());
    }

    // Split the batch over several smaller queries.
    $maxLoops = 5;
    $max = (int) ceil($max / $maxLoops);

    $start = KTUtil::getBenchmarkTime();
    $noDocs = false;
    $numDocs = 0;

    for ($loop = 0; $loop < $maxLoops; $loop++)
    {
        $sql = "SELECT
document_id, document_text
FROM
document_text
ORDER BY document_id
LIMIT $max";
        $result = DBUtil::getResultArray($sql);
        if (PEAR::isError($result))
        {
            $default->log->info(_kt('migrateDocuments: db error'));
            break;
        }

        $docs = count($result);
        if ($docs == 0)
        {
            $noDocs = true;
            break;
        }
        $numDocs += $docs;

        foreach ($result as $docinfo)
        {
            // The id is interpolated into SQL below, so keep it strictly numeric.
            $docId = (int) $docinfo['document_id'];

            $document = Document::get($docId);
            if (PEAR::isError($document) || is_null($document))
            {
                $sql = "DELETE FROM document_text WHERE document_id=$docId";
                DBUtil::runQuery($sql);
                $default->log->error(sprintf(_kt('migrateDocuments: Could not get document %d\'s document! Removing content!'),$docId));
                continue;
            }

            $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();

            $targetFile = tempnam($tempPath, 'ktindexer');
            if ($targetFile === false || file_put_contents($targetFile, $docinfo['document_text']) === false)
            {
                $default->log->error(sprintf(_kt('migrateDocuments: Cannot write to \'%s\' for document id %d'), $targetFile, $docId));
                // Do not leave an empty temp file behind when the write failed.
                if ($targetFile !== false)
                {
                    @unlink($targetFile);
                }
                continue;
            }

            // free memory asap ;)
            unset($docinfo['document_text']);

            $title = $document->getName();
            $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
            if ($indexStatus)
            {
                $sql = "DELETE FROM document_text WHERE document_id=$docId";
                DBUtil::runQuery($sql);
            }
            else
            {
                $default->log->error(sprintf(_kt("migrateDocuments: Problem indexing document %d"), $docId));
            }

            @unlink($targetFile);
        }
    }

    @unlink($lockFile);

    $time = KTUtil::getBenchmarkTime() - $start;
    KTUtil::setSystemSetting('migrationTime', KTUtil::getSystemSetting('migrationTime',0) + $time);
    KTUtil::setSystemSetting('migratedDocuments', KTUtil::getSystemSetting('migratedDocuments',0) + $numDocs);

    $default->log->info(sprintf(_kt('migrateDocuments: stopping - done in %d seconds!'), $time));

    if ($noDocs)
    {
        $default->log->info(_kt('migrateDocuments: Completed!'));
        KTUtil::setSystemSetting('migrationComplete', 'true');
        schedulerUtil::deleteByName('Index Migration');
        $default->log->debug(_kt('migrateDocuments: Disabling \'Index Migration\' task by removing scheduler entry.'));
    }
}
/**
 * Index a document. Concrete indexer subclasses must implement this function.
 *
 * Callers in this class treat the return value as the indexing status:
 * a truthy result means the document was indexed.
 *
 * @param int $docId Id of the document being indexed.
 * @param string $textFile Path to the file holding the extracted text.
 * @param string $title Title of the document.
 * @param string $version Version of the document (callers pass "major.minor").
 */
protected abstract function indexDocument($docId, $textFile, $title, $version);
/**
 * Re-index a document using the supplied text as its content.
 *
 * The text is written to a temporary file in the 'urls/tmpDirectory' location,
 * passed to indexDocument() and removed again afterwards.
 *
 * @param int $docId Id of the document to update.
 * @param string $text The text to index for the document.
 * @return mixed The result of indexDocument(), or false when the document
 *               cannot be resolved or the temporary file cannot be written.
 */
public function updateDocumentIndex($docId, $text)
{
    global $default;

    // Resolve the document first: nothing can be indexed without it.
    $document = Document::get($docId);
    if (PEAR::isError($document) || is_null($document))
    {
        $default->log->error(sprintf(_kt("updateDocumentIndex: Cannot resolve document id %d."), $docId));
        return false;
    }

    $config = KTConfig::getSingleton();
    $tempPath = $config->get("urls/tmpDirectory");

    $tempFile = tempnam($tempPath,'ud_');
    if ($tempFile === false || file_put_contents($tempFile, $text) === false)
    {
        $default->log->error(sprintf(_kt("updateDocumentIndex: Cannot write temporary file for document id %d."), $docId));
        if ($tempFile !== false && file_exists($tempFile))
        {
            unlink($tempFile);
        }
        return false;
    }

    $title = $document->getDescription();
    $version = $document->getVersion();

    $result = $this->indexDocument($docId, $tempFile, $title, $version);

    if (file_exists($tempFile))
    {
        unlink($tempFile);
    }

    return $result;
}
/**
 * Index the discussion of a document. Concrete indexer subclasses must implement this function.
 *
 * Callers in this class treat the return value as the indexing status:
 * a truthy result means the discussion was indexed.
 *
 * @param int $docId Id of the document whose discussion is indexed.
 */
protected abstract function indexDiscussion($docId);
/**
 * Diagnose the indexer. e.g. Check that the indexing server is running.
 *
 * Concrete indexer subclasses must implement this function. The result is
 * serialised and stored by updateIndexStats().
 */
public abstract function diagnose();
/**
 * Collect the diagnostics of all extractors and hooks.
 *
 * Scans the extractor directory for DocumentExtractor classes and the hook
 * directory for Hook classes, and merges both sets of findings.
 *
 * @return array Diagnoses keyed by class name.
 */
public function diagnoseExtractors()
{
    $extractorFindings = $this->_diagnose($this->extractorPath, 'DocumentExtractor', 'Extractor.inc.php');
    $hookFindings = $this->_diagnose($this->hookPath, 'Hook', 'Hook.inc.php');

    return array_merge($extractorFindings, $hookFindings);
}
/**
 * This is a refactored diagnose function.
 *
 * Loads every file in $path whose name ends in $extension, instantiates the
 * class named after the file (file name without the trailing '.inc.php') and
 * collects the non-empty results of its diagnose() method. Disabled classes,
 * classes that are not of type $baseclass and classes that support no mime
 * types are skipped.
 *
 * @param string $path Directory to scan.
 * @param string $baseclass Class that every loaded class is expected to be an instance of.
 * @param string $extension File name suffix that identifies the files to load.
 * @return array Map of class name => array('name' => display name, 'diagnosis' => diagnosis).
 *               Empty when the directory cannot be opened.
 */
private function _diagnose($path, $baseclass, $extension)
{
    global $default;

    $diagnoses = array();

    $dir = opendir(SearchHelper::correctPath($path));
    if ($dir === false)
    {
        // Without a directory handle there is nothing to scan.
        $default->log->error(sprintf(_kt("diagnose: could not open directory '%s'."), $path));
        return $diagnoses;
    }

    // negative offset so substr() returns the tail of the file name
    $extlen = - strlen($extension);

    while (($file = readdir($dir)) !== false)
    {
        // skip hidden entries as well as '.' and '..'
        if (substr($file,0,1) == '.')
        {
            continue;
        }
        if (substr($file,$extlen) != $extension)
        {
            $default->log->error(sprintf(_kt("diagnose: '%s' does not have extension '%s'."), $file, $extension));
            continue;
        }

        require_once($path . '/' . $file);

        // the class name is the file name without the trailing '.inc.php'
        $class = substr($file, 0, -8);
        if (!class_exists($class))
        {
            $default->log->error(sprintf(_kt("diagnose: class '%s' does not exist."), $class));
            continue;
        }
        if (!$this->isExtractorEnabled($class))
        {
            $default->log->debug(sprintf(_kt("diagnose: extractor '%s' is disabled."), $class));
            continue;
        }

        $extractor = new $class();
        if (!is_a($extractor, $baseclass))
        {
            // report the base class that was actually expected (DocumentExtractor or Hook)
            $default->log->error(sprintf(_kt("diagnose(): '%s' is not of type '%s'"), $class, $baseclass));
            continue;
        }

        $types = $extractor->getSupportedMimeTypes();
        if (empty($types))
        {
            if ($this->debug) $default->log->debug(sprintf(_kt("diagnose: class '%s' does not support any types."), $class));
            continue;
        }

        // an empty diagnosis means there is nothing to report for this class
        $diagnosis = $extractor->diagnose();
        if (empty($diagnosis))
        {
            continue;
        }

        $diagnoses[$class] = array(
            'name'=>$extractor->getDisplayName(),
            'diagnosis'=>$diagnosis
        );
    }
    closedir($dir);

    return $diagnoses;
}
/**
 * Register the extractor types.
 *
 * Loads every '*Extractor.inc.php' file in the extractor directory and lets
 * each DocumentExtractor it defines register its mime types.
 *
 * @param boolean $clear Optional. When true, the existing extractors are cleared first. Defaults to false.
 * @return void
 */
public function registerTypes($clear=false)
{
    global $default;

    if ($clear)
    {
        $this->clearExtractors();
    }

    $dir = opendir(SearchHelper::correctPath($this->extractorPath));
    if ($dir === false)
    {
        // Without a directory handle there is nothing to register.
        $default->log->error(sprintf(_kt("registerTypes: could not open directory '%s'."), $this->extractorPath));
        return;
    }

    while (($file = readdir($dir)) !== false)
    {
        if (substr($file,-17) != 'Extractor.inc.php')
        {
            continue;
        }

        require_once($this->extractorPath . '/' . $file);

        // the class name is the file name without the trailing '.inc.php'
        $class = substr($file, 0, -8);
        if (!class_exists($class))
        {
            // if the class does not exist, we can't do anything.
            continue;
        }

        $extractor = new $class;
        if ($extractor instanceof DocumentExtractor)
        {
            $extractor->registerMimeTypes();
        }
    }
    closedir($dir);
}
/**
 * Index a document together with its discussion.
 *
 * This is used as a possible optimisation effort. It may be overridden in that case.
 * The default implementation indexes the document and the discussion one after
 * the other; both steps always run.
 *
 * @param int $docId Id of the document being indexed.
 * @param string $textFile Path to the file holding the extracted text.
 * @param string $title Title of the document.
 * @param string $version Version of the document.
 * @return boolean true only when both the document and the discussion were
 *                 indexed. Callers use this as the indexing status.
 */
protected function indexDocumentAndDiscussion($docId, $textFile, $title, $version)
{
    $documentIndexed = $this->indexDocument($docId, $textFile, $title, $version);
    $discussionIndexed = $this->indexDiscussion($docId);

    // Callers use the result to decide whether the document can leave the queue.
    return $documentIndexed && $discussionIndexed;
}
/**
 * Delete a document's entry from the indexing queue (index_files table).
 * This is normally called once the document has been processed.
 *
 * @param int $docid Id of the document to remove.
 * @param string|boolean $reason Optional explanation to log; nothing is logged when false.
 * @param string $level Name of the logger method used for the message. Defaults to 'debug'.
 */
public static function unqueueDocument($docid, $reason=false, $level='debug')
{
    global $default;

    DBUtil::runQuery("DELETE FROM index_files WHERE document_id=$docid");

    // no reason supplied - nothing to log
    if ($reason === false)
    {
        return;
    }

    $default->log->$level("Indexer: removing document $docid from the queue - $reason");
}
/**
 * Delete a document's entry from the processing queue (process_queue table).
 * This is normally called once the document has been processed.
 *
 * @param int $docid Id of the document to remove.
 * @param string|boolean $reason Optional explanation to log; nothing is logged when false.
 * @param string $level Name of the logger method used for the message. Defaults to 'debug'.
 */
public static function unqueueDocFromProcessing($docid, $reason=false, $level='debug')
{
    global $default;

    DBUtil::runQuery("DELETE FROM process_queue WHERE document_id=$docid");

    // no reason supplied - nothing to log
    if ($reason === false)
    {
        return;
    }

    $default->log->$level("Processor queue: removing document $docid from the queue - $reason");
}
/**
 * Run a query on the index. Concrete indexer subclasses must implement this function.
 *
 * @param string $query The query to run.
 * @return array The query results.
 */
public abstract function query($query);
/**
 * Encode an integer as a fixed-width string of letters that can be easily
 * compared and reversed.
 *
 * The number is left-padded with zeros to 14 characters and every digit 0-9
 * is mapped to the letter a-j, e.g. 42 becomes 'aaaaaaaaaaaaec'.
 *
 * @param int $int
 * @return string
 */
public static function longToString($int)
{
    $width = 14;

    // zero-pad on the left; values already wider than $width are left as they are
    $padded = str_pad("$int", $width, '0', STR_PAD_LEFT);

    // translate each digit to its letter: 0 => a, 1 => b, ... 9 => j
    return strtr($padded, '0123456789', 'abcdefghij');
}
/**
 * Decode a string produced by longToString() back into a number.
 *
 * Every letter a-j is mapped back to the digit 0-9 and the resulting numeric
 * string is converted to a number.
 *
 * @param string $str
 * @return int
 */
public static function stringToLong($str)
{
    // translate each letter to its digit: a => 0, b => 1, ... j => 9
    $digits = strtr($str, 'abcdefghij', '0123456789');

    // adding zero converts the numeric string to a number
    return $digits + 0;
}
/**
 * Hook for optimising the index. Subclasses that can optimise their index
 * should override this method and must call the parent implementation.
 *
 * The base implementation only records the current time as the
 * 'luceneOptimisationDate' system setting.
 */
public function optimise()
{
    $optimisedAt = time();
    KTUtil::setSystemSetting('luceneOptimisationDate', $optimisedAt);
}
/**
 * Shuts down the indexer.
 *
 * The base implementation is intentionally empty.
 */
public function shutdown()
{
// do nothing generally
}
/**
 * Returns the name of the indexer. Concrete indexer subclasses must implement this function.
 *
 * @return string
 */
public abstract function getDisplayName();
/**
 * Returns the number of non-deleted documents in the index.
 * Concrete indexer subclasses must implement this function.
 *
 * @return int
 */
public abstract function getDocumentsInIndex();
/**
 * Reports whether the given document is in the index.
 * Concrete indexer subclasses must implement this function.
 *
 * NOTE(review): the return type is not visible here; presumably boolean - confirm against the implementations.
 *
 * @param int $documentId
 */
public abstract function isDocumentIndexed($documentId);
/**
 * Look up the directory that holds the index.
 *
 * @return string The value of the 'indexer/luceneDirectory' config setting.
 */
public function getIndexDirectory()
{
    return KTConfig::getSingleton()->get('indexer/luceneDirectory');
}
}
?>