SearchAPI-Custom.php 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353
  1. <?php
  2. /**
  3. * Simple Machines Forum (SMF)
  4. *
  5. * @package SMF
  6. * @author Simple Machines http://www.simplemachines.org
  7. * @copyright 2012 Simple Machines
  8. * @license http://www.simplemachines.org/about/smf/license.php BSD
  9. *
  10. * @version 2.1 Alpha 1
  11. */
  // Refuse to run when the SMF constant has not been defined by the including script.
  defined('SMF') or die('Hacking attempt...');
  /**
   * Custom Search API class, used when the custom SMF search index is selected.
   *
   * Words are looked up in, and written to, the {db_prefix}log_search_words table.
   */
  class custom_search
  {
      /**
       * The last version of SMF that this was tested on, to protect against API changes.
       * @var string
       */
      public $version_compatible = 'SMF 2.1 Alpha 1';

      /**
       * This won't work with versions of SMF older than this.
       * @var string
       */
      public $min_smf_version = 'SMF 2.1 Alpha 1';

      /**
       * Whether this API can be used; set to false by the constructor when
       * the current database type is not in $supported_databases.
       * @var bool
       */
      public $is_supported = true;

      /**
       * Index settings, unserialized from $modSettings['search_custom_index_config'].
       * @var array
       */
      protected $indexSettings = array();

      /**
       * Stop words, split from the comma separated $modSettings['search_stopwords'].
       * @var array
       */
      protected $bannedWords = array();

      /**
       * Minimum word length; taken from the 'bytes_per_word' index setting.
       * Stays null while no index is configured.
       * @var int|null
       */
      protected $min_word_length = null;

      /**
       * Database types that support the custom index.
       * @var array
       */
      protected $supported_databases = array('mysql', 'postgresql', 'sqlite');

      /**
       * Checks database support and loads the index settings, the stop words
       * and the minimum word length from $modSettings.
       */
      public function __construct()
      {
          global $modSettings, $db_type;

          // Is this database supported?
          if (!in_array($db_type, $this->supported_databases, true))
          {
              $this->is_supported = false;
              return;
          }

          // No index configured: keep the defaults.
          if (empty($modSettings['search_custom_index_config']))
              return;

          // unserialize() returns false for a corrupt value; never keep a non-array around.
          $settings = unserialize($modSettings['search_custom_index_config']);
          $this->indexSettings = is_array($settings) ? $settings : array();

          $this->bannedWords = empty($modSettings['search_stopwords']) ? array() : explode(',', $modSettings['search_stopwords']);
          $this->min_word_length = isset($this->indexSettings['bytes_per_word']) ? $this->indexSettings['bytes_per_word'] : null;
      }

      /**
       * Check whether the given method is implemented by this API.
       *
       * @param string $methodName Name of the API method being asked about
       * @param mixed $query_params Not used by this implementation
       * @return bool True for the methods this class provides, false otherwise
       */
      public function supportsMethod($methodName, $query_params = null)
      {
          switch ($methodName)
          {
              case 'isValid':
              case 'searchSort':
              case 'prepareIndexes':
              case 'indexedWordQuery':
              case 'postCreated':
              case 'postModified':
                  return true;

              // All other methods, too bad dunno you.
              default:
                  return false;
          }
      }

      /**
       * If the settings don't exist we can't continue.
       *
       * @return bool True when $modSettings['search_custom_index_config'] is set and non-empty
       */
      public function isValid()
      {
          global $modSettings;

          return !empty($modSettings['search_custom_index_config']);
      }

      /**
       * Callback function for usort, used to sort the search words.
       *
       * Each word is scored by its byte length, minus 1000 when it is listed in
       * the global $excludedWords. Words are ordered by ascending score, so
       * excluded words sort before the others and shorter words before longer ones.
       *
       * @param string $a Word A
       * @param string $b Word B
       * @return int 1 when A scores higher than B, -1 when lower, 0 when equal
       */
      public function searchSort($a, $b)
      {
          global $excludedWords;

          // The global may not have been populated; treat that as "nothing excluded".
          $excluded = is_array($excludedWords) ? $excludedWords : array();

          $x = strlen($a) - (in_array($a, $excluded) ? 1000 : 0);
          $y = strlen($b) - (in_array($b, $excluded) ? 1000 : 0);

          return $y < $x ? 1 : ($y > $x ? -1 : 0);
      }

      /**
       * Prepare a search word: record it and split it into indexed sub-words.
       *
       * @param string $word The word or phrase being searched for
       * @param array &$wordsSearch Receives the word in 'words' (unless
       *     $modSettings['search_force_index'] is set) and its usable sub-words in 'indexed_words'
       * @param array &$wordsExclude Receives the sub-words when $isExcluded is true
       * @param bool $isExcluded Whether the word is excluded from the search
       */
      public function prepareIndexes($word, &$wordsSearch, &$wordsExclude, $isExcluded)
      {
          global $modSettings, $smcFunc;

          $subwords = text2words($word, $this->min_word_length, true);

          if (empty($modSettings['search_force_index']))
              $wordsSearch['words'][] = $word;

          // Excluded phrases don't benefit from being split into subwords.
          // (This used to be a `continue` outside of any loop, which is a fatal error.)
          if (count($subwords) > 1 && $isExcluded)
              return;

          foreach ($subwords as $subword)
          {
              // Skip sub-words that are too short or are stop words.
              if ($smcFunc['strlen']($subword) < $this->min_word_length || in_array($subword, $this->bannedWords))
                  continue;

              $wordsSearch['indexed_words'][] = $subword;
              if ($isExcluded)
                  $wordsExclude[] = $subword;
          }
      }

      /**
       * Search for indexed words.
       *
       * Builds one INSERT ... SELECT over {db_prefix}messages, joined to
       * {db_prefix}log_search_words once per indexed word, and runs it through
       * $smcFunc['db_search_query'].
       *
       * @param array $words Search words: 'words' (matched against m.body) and
       *     'indexed_words' (matched against the word index)
       * @param array $search_data Search context; reads 'params', 'no_regexp',
       *     'insert_into', 'max_results' and 'indexed_results'
       * @return mixed Whatever $smcFunc['db_search_query'] returns
       */
      public function indexedWordQuery($words, $search_data)
      {
          global $modSettings, $smcFunc;

          $query_select = array(
              'id_msg' => 'm.id_msg',
          );
          $query_inner_join = array();
          $query_left_join = array();
          $query_where = array();
          $query_params = $search_data['params'];

          // Plain LIKE matching is used unless whole-word matching is on and regexps are allowed.
          $use_like = empty($modSettings['search_match_words']) || !empty($search_data['no_regexp']);
          $force_index = !empty($modSettings['search_force_index']);

          $excluded_words = empty($query_params['excluded_words']) ? array() : $query_params['excluded_words'];
          $excluded_index_words = empty($query_params['excluded_index_words']) ? array() : $query_params['excluded_index_words'];

          if (!empty($query_params['id_search']))
              $query_select['id_search'] = '{int:id_search}';

          // Regular (non indexed) words are matched against the message body.
          $count = 0;
          if (!empty($words['words']))
              foreach ($words['words'] as $regularWord)
              {
                  $query_where[] = 'm.body' . (in_array($regularWord, $excluded_words) ? ' NOT' : '') . $this->matchOperator($use_like) . '{string:complex_body_' . $count . '}';
                  $query_params['complex_body_' . $count++] = $this->matchValue($regularWord, $use_like);
              }

          if (!empty($query_params['user_query']))
              $query_where[] = '{raw:user_query}';
          if (!empty($query_params['board_query']))
              $query_where[] = 'm.id_board {raw:board_query}';
          if (!empty($query_params['topic']))
              $query_where[] = 'm.id_topic = {int:topic}';
          if (!empty($query_params['min_msg_id']))
              $query_where[] = 'm.id_msg >= {int:min_msg_id}';
          if (!empty($query_params['max_msg_id']))
              $query_where[] = 'm.id_msg <= {int:max_msg_id}';

          // Excluded phrases and excluded subject words must not appear in the subject.
          $count = 0;
          if (!empty($query_params['excluded_phrases']) && !$force_index)
              foreach ($query_params['excluded_phrases'] as $phrase)
              {
                  $query_where[] = 'subject NOT' . $this->matchOperator($use_like) . '{string:exclude_subject_phrase_' . $count . '}';
                  $query_params['exclude_subject_phrase_' . $count++] = $this->matchValue($phrase, $use_like);
              }

          $count = 0;
          if (!empty($query_params['excluded_subject_words']) && !$force_index)
              foreach ($query_params['excluded_subject_words'] as $excludedWord)
              {
                  $query_where[] = 'subject NOT' . $this->matchOperator($use_like) . '{string:exclude_subject_words_' . $count . '}';
                  $query_params['exclude_subject_words_' . $count++] = $this->matchValue($excludedWord, $use_like);
              }

          // One join on the word index per indexed word.
          // NOTE(review): $indexedWord is concatenated straight into the SQL; presumably it is
          // always a numeric value produced by text2words() - confirm against the callers.
          $numTables = 0;
          $prev_join = 0;
          if (!empty($words['indexed_words']))
              foreach ($words['indexed_words'] as $indexedWord)
              {
                  $numTables++;
                  $alias = 'lsw' . $numTables;

                  if (in_array($indexedWord, $excluded_index_words))
                  {
                      // Excluded word: left join and require that no index row matched.
                      $query_left_join[] = '{db_prefix}log_search_words AS ' . $alias . ' ON (' . $alias . '.id_word = ' . $indexedWord . ' AND ' . $alias . '.id_msg = m.id_msg)';
                      $query_where[] = '(' . $alias . '.id_word IS NULL)';
                  }
                  else
                  {
                      // Required word: chain the inner join to the previous one (or to the messages table).
                      $query_inner_join[] = '{db_prefix}log_search_words AS ' . $alias . ' ON (' . $alias . '.id_msg = ' . ($prev_join === 0 ? 'm' : 'lsw' . $prev_join) . '.id_msg)';
                      $query_where[] = $alias . '.id_word = ' . $indexedWord;
                      $prev_join = $numTables;
                  }
              }

          // Assemble the statement piece by piece.
          $sql = '';
          if ($smcFunc['db_support_ignore'])
              $sql .= '
              INSERT IGNORE INTO {db_prefix}' . $search_data['insert_into'] . '
                  (' . implode(', ', array_keys($query_select)) . ')';

          $sql .= '
              SELECT ' . implode(', ', $query_select) . '
              FROM {db_prefix}messages AS m';

          if (!empty($query_inner_join))
              $sql .= '
                  INNER JOIN ' . implode('
                  INNER JOIN ', $query_inner_join);

          if (!empty($query_left_join))
              $sql .= '
                  LEFT JOIN ' . implode('
                  LEFT JOIN ', $query_left_join);

          $sql .= '
              WHERE ' . implode('
                  AND ', $query_where);

          // Only fetch as many rows as are still needed to reach max_results.
          if (!empty($search_data['max_results']))
              $sql .= '
              LIMIT ' . ($search_data['max_results'] - $search_data['indexed_results']);

          $ignoreRequest = $smcFunc['db_search_query']('insert_into_log_messages_fulltext', $sql, $query_params);

          return $ignoreRequest;
      }

      /**
       * After a post is made, we update the search index database.
       *
       * @param array $msgOptions Message data; reads 'body' and 'id'
       * @param array $topicOptions Not used by this implementation
       * @param array $posterOptions Not used by this implementation
       */
      public function postCreated($msgOptions, $topicOptions, $posterOptions)
      {
          global $smcFunc;

          $inserts = array();
          foreach (text2words($msgOptions['body'], $this->configuredBytesPerWord(), true) as $word)
              $inserts[] = array($word, $msgOptions['id']);

          if (!empty($inserts))
              $smcFunc['db_insert']('ignore',
                  '{db_prefix}log_search_words',
                  array('id_word' => 'int', 'id_msg' => 'int'),
                  $inserts,
                  array('id_word', 'id_msg')
              );
      }

      /**
       * After a post is modified, we update the search index database.
       *
       * Nothing happens unless $msgOptions['body'] is set.
       *
       * @param array $msgOptions Message data; reads 'body', 'id' and the optional 'old_body'
       * @param array $topicOptions Not used by this implementation
       * @param array $posterOptions Not used by this implementation
       */
      public function postModified($msgOptions, $topicOptions, $posterOptions)
      {
          global $modSettings, $smcFunc;

          if (!isset($msgOptions['body']))
              return;

          $bytes_per_word = $this->configuredBytesPerWord();
          $stopwords = empty($modSettings['search_stopwords']) ? array() : explode(',', $modSettings['search_stopwords']);
          $old_body = isset($msgOptions['old_body']) ? $msgOptions['old_body'] : '';

          // Create the new and old index.
          $old_index = text2words($old_body, $bytes_per_word, true);
          $new_index = text2words($msgOptions['body'], $bytes_per_word, true);

          // Calculate the words to be added and removed from the index.
          $removed_words = array_diff(array_diff($old_index, $new_index), $stopwords);
          $inserted_words = array_diff(array_diff($new_index, $old_index), $stopwords);

          // Delete the removed words AND the added ones to avoid key constraints.
          // This has to run even when nothing was removed: without 'old_body' every word
          // counts as new, and the plain insert below would collide with existing rows.
          $stale_words = array_merge($removed_words, $inserted_words);
          if (!empty($stale_words))
          {
              $smcFunc['db_query']('', '
                  DELETE FROM {db_prefix}log_search_words
                  WHERE id_msg = {int:id_msg}
                      AND id_word IN ({array_int:removed_words})',
                  array(
                      'removed_words' => $stale_words,
                      'id_msg' => $msgOptions['id'],
                  )
              );
          }

          // Add the new words to be indexed.
          if (!empty($inserted_words))
          {
              $inserts = array();
              foreach ($inserted_words as $word)
                  $inserts[] = array($word, $msgOptions['id']);

              // id_word is declared as an int here too, matching postCreated() and the DELETE above.
              $smcFunc['db_insert']('insert',
                  '{db_prefix}log_search_words',
                  array('id_word' => 'int', 'id_msg' => 'int'),
                  $inserts,
                  array('id_word', 'id_msg')
              );
          }
      }

      /**
       * Returns the SQL comparison operator (with surrounding spaces) for a text match.
       *
       * @param bool $use_like True for a LIKE match, false for a regular expression match
       * @return string ' LIKE ' or ' RLIKE '
       */
      private function matchOperator($use_like)
      {
          return $use_like ? ' LIKE ' : ' RLIKE ';
      }

      /**
       * Builds the value to compare against for a text match.
       *
       * @param string $text The word or phrase to match
       * @param bool $use_like True to build a LIKE pattern, false to build a regular expression
       * @return string The text wrapped in % with _ and % escaped, or the text between
       *     [[:<:]] and [[:>:]] with regular expression metacharacters bracketed
       */
      private function matchValue($text, $use_like)
      {
          if ($use_like)
              return '%' . strtr($text, array('_' => '\\_', '%' => '\\%')) . '%';

          return '[[:<:]]' . addcslashes(preg_replace(array('/([\[\]$.+*?|{}()])/'), array('[$1]'), $text), '\\\'') . '[[:>:]]';
      }

      /**
       * Reads 'bytes_per_word' from the stored custom index configuration.
       *
       * @return mixed The configured value, or null when the configuration is missing or unreadable
       */
      private function configuredBytesPerWord()
      {
          global $modSettings;

          if (empty($modSettings['search_custom_index_config']))
              return null;

          $settings = unserialize($modSettings['search_custom_index_config']);

          return is_array($settings) && isset($settings['bytes_per_word']) ? $settings['bytes_per_word'] : null;
      }
  }