<?php
/**
 * Simple Machines Forum (SMF)
 *
 * @package SMF
 * @author Simple Machines http://www.simplemachines.org
 * @copyright 2011 Simple Machines
 * @license http://www.simplemachines.org/about/smf/license.php BSD
 *
 * @version 2.1 Alpha 1
 */
  // Stop with an error message when the SMF constant has not been defined.
  if (defined('SMF') === false)
  {
      die('Hacking attempt...');
  }
  /**
   * Search API that answers searches from the custom word index stored in the
   * {db_prefix}log_search_words table, and keeps that index in sync when posts
   * are created or modified.
   */
  class custom_search
  {
      // This is the last version of SMF that this was tested on, to protect against API changes.
      public $version_compatible = 'SMF 2.1 Alpha 1';

      // This won't work with versions of SMF less than this.
      public $min_smf_version = 'SMF 2.1 Alpha 1';

      // Is it supported?
      public $is_supported = true;

      // Decoded contents of $modSettings['search_custom_index_config'].
      protected $indexSettings = array();

      // What words are banned?
      protected $bannedWords = array();

      // What is the minimum word length?
      protected $min_word_length = null;

      // What databases support the custom index?
      protected $supported_databases = array('mysql', 'postgresql', 'sqlite');

      /**
       * Reads the custom index configuration and the stop word list from $modSettings.
       *
       * Marks the API as unsupported when $db_type is not one of the supported
       * databases, and leaves the defaults untouched when no index configuration exists.
       */
      public function __construct()
      {
          global $modSettings, $db_type;

          // Is this database supported?
          if (!in_array($db_type, $this->supported_databases))
          {
              $this->is_supported = false;
              return;
          }

          if (empty($modSettings['search_custom_index_config']))
              return;

          // A corrupt setting makes unserialize() return false; fall back to an empty configuration.
          $settings = unserialize($modSettings['search_custom_index_config']);
          $this->indexSettings = is_array($settings) ? $settings : array();

          $this->bannedWords = empty($modSettings['search_stopwords']) ? array() : explode(',', $modSettings['search_stopwords']);
          $this->min_word_length = isset($this->indexSettings['bytes_per_word']) ? $this->indexSettings['bytes_per_word'] : null;
      }

      /**
       * Check whether the search can be performed by this API.
       *
       * @param string $methodName Name of the API method being asked about
       * @param mixed $query_params Not used by this implementation
       * @return bool True when this class implements the named method
       */
      public function supportsMethod($methodName, $query_params = null)
      {
          switch ($methodName)
          {
              case 'isValid':
              case 'searchSort':
              case 'prepareIndexes':
              case 'indexedWordQuery':
              case 'postCreated':
              case 'postModified':
                  return true;

              // Anything else is not provided by this API.
              default:
                  return false;
          }
      }

      /**
       * If the settings don't exist we can't continue.
       *
       * @return bool True when $modSettings['search_custom_index_config'] is non-empty
       */
      public function isValid()
      {
          global $modSettings;

          return !empty($modSettings['search_custom_index_config']);
      }

      /**
       * Callback function for usort used to sort the fulltext results.
       * Words are weighted by their byte length; words found in the global
       * $excludedWords list get 1000 subtracted from that weight.
       *
       * @param string $a Word A
       * @param string $b Word B
       * @return int 1 when A weighs more than B, -1 when it weighs less, 0 when equal
       */
      public function searchSort($a, $b)
      {
          global $excludedWords;

          $x = strlen($a) - (in_array($a, $excludedWords) ? 1000 : 0);
          $y = strlen($b) - (in_array($b, $excludedWords) ? 1000 : 0);

          return $y < $x ? 1 : ($y > $x ? -1 : 0);
      }

      /**
       * Do we have to do some work with the words we are searching for to prepare them?
       *
       * Adds the raw word to $wordsSearch['words'] (unless the index is forced) and
       * every qualifying subword returned by text2words() to $wordsSearch['indexed_words'].
       *
       * @param string $word The word or phrase being searched for
       * @param array &$wordsSearch Receives the 'words' and 'indexed_words' entries
       * @param array &$wordsExclude Receives the indexed subwords of an excluded word
       * @param bool $isExcluded Whether $word is excluded from the search
       */
      public function prepareIndexes($word, &$wordsSearch, &$wordsExclude, $isExcluded)
      {
          global $modSettings, $smcFunc;

          $subwords = text2words($word, $this->min_word_length, true);

          if (empty($modSettings['search_force_index']))
              $wordsSearch['words'][] = $word;

          // Excluded phrases don't benefit from being split into subwords.
          // (This used to be a "continue", which is a fatal error outside of a loop.)
          if (count($subwords) > 1 && $isExcluded)
              return;

          foreach ($subwords as $subword)
          {
              if ($smcFunc['strlen']($subword) >= $this->min_word_length && !in_array($subword, $this->bannedWords))
              {
                  $wordsSearch['indexed_words'][] = $subword;

                  if ($isExcluded)
                      $wordsExclude[] = $subword;
              }
          }
      }

      /**
       * Search for indexed words.
       *
       * Builds one query against {db_prefix}messages that is joined once with
       * {db_prefix}log_search_words per indexed word, and hands it to
       * $smcFunc['db_search_query'].
       *
       * @param array $words Uses the 'words' and 'indexed_words' lists
       * @param array $search_data Uses 'params', 'no_regexp', 'insert_into', 'max_results' and 'indexed_results'
       * @return mixed Whatever $smcFunc['db_search_query'] returns
       */
      public function indexedWordQuery($words, $search_data)
      {
          global $modSettings, $smcFunc;

          $query_select = array(
              'id_msg' => 'm.id_msg',
          );
          $query_inner_join = array();
          $query_left_join = array();
          $query_where = array();
          $query_params = $search_data['params'];

          if ($query_params['id_search'])
              $query_select['id_search'] = '{int:id_search}';

          // Plain LIKE matching is used unless whole word matching is on and regular expressions are allowed.
          $useLike = empty($modSettings['search_match_words']) || !empty($search_data['no_regexp']);
          $operator = $useLike ? ' LIKE ' : ' RLIKE ';

          $count = 0;
          foreach ($words['words'] as $regularWord)
          {
              $query_where[] = 'm.body' . (in_array($regularWord, $query_params['excluded_words']) ? ' NOT' : '') . $operator . '{string:complex_body_' . $count . '}';
              $query_params['complex_body_' . $count++] = $this->buildMatchPattern($regularWord, $useLike);
          }

          if ($query_params['user_query'])
              $query_where[] = '{raw:user_query}';
          if ($query_params['board_query'])
              $query_where[] = 'm.id_board {raw:board_query}';
          if ($query_params['topic'])
              $query_where[] = 'm.id_topic = {int:topic}';
          if ($query_params['min_msg_id'])
              $query_where[] = 'm.id_msg >= {int:min_msg_id}';
          if ($query_params['max_msg_id'])
              $query_where[] = 'm.id_msg <= {int:max_msg_id}';

          // The subject exclusions are only applied when the index is not forced.
          $count = 0;
          if (!empty($query_params['excluded_phrases']) && empty($modSettings['search_force_index']))
          {
              foreach ($query_params['excluded_phrases'] as $phrase)
              {
                  $query_where[] = 'subject NOT ' . $operator . '{string:exclude_subject_phrase_' . $count . '}';
                  $query_params['exclude_subject_phrase_' . $count++] = $this->buildMatchPattern($phrase, $useLike);
              }
          }

          $count = 0;
          if (!empty($query_params['excluded_subject_words']) && empty($modSettings['search_force_index']))
          {
              foreach ($query_params['excluded_subject_words'] as $excludedWord)
              {
                  $query_where[] = 'subject NOT ' . $operator . '{string:exclude_subject_words_' . $count . '}';
                  $query_params['exclude_subject_words_' . $count++] = $this->buildMatchPattern($excludedWord, $useLike);
              }
          }

          // NOTE(review): the word ids are concatenated into the SQL as-is; this presumably relies on
          // text2words() returning numeric hashes - confirm before feeding this list from anywhere else.
          $numTables = 0;
          $prev_join = 0;
          foreach ($words['indexed_words'] as $indexedWord)
          {
              $numTables++;
              if (in_array($indexedWord, $query_params['excluded_index_words']))
              {
                  // Excluded word: left join and require that no index row matched.
                  $query_left_join[] = '{db_prefix}log_search_words AS lsw' . $numTables . ' ON (lsw' . $numTables . '.id_word = ' . $indexedWord . ' AND lsw' . $numTables . '.id_msg = m.id_msg)';
                  $query_where[] = '(lsw' . $numTables . '.id_word IS NULL)';
              }
              else
              {
                  // Required word: chain the join onto the previous inner join (or onto messages for the first one).
                  $query_inner_join[] = '{db_prefix}log_search_words AS lsw' . $numTables . ' ON (lsw' . $numTables . '.id_msg = ' . ($prev_join === 0 ? 'm' : 'lsw' . $prev_join) . '.id_msg)';
                  $query_where[] = 'lsw' . $numTables . '.id_word = ' . $indexedWord;
                  $prev_join = $numTables;
              }
          }

          // Assemble the statement piece by piece.
          $sql = '';
          if ($smcFunc['db_support_ignore'])
              $sql .= '
                  INSERT IGNORE INTO {db_prefix}' . $search_data['insert_into'] . '
                      (' . implode(', ', array_keys($query_select)) . ')';

          $sql .= '
                  SELECT ' . implode(', ', $query_select) . '
                  FROM {db_prefix}messages AS m';

          if (!empty($query_inner_join))
              $sql .= '
                      INNER JOIN ' . implode('
                      INNER JOIN ', $query_inner_join);

          if (!empty($query_left_join))
              $sql .= '
                      LEFT JOIN ' . implode('
                      LEFT JOIN ', $query_left_join);

          $sql .= '
                  WHERE ' . implode('
                      AND ', $query_where);

          // Only ask for as many rows as are still missing from the result limit.
          if (!empty($search_data['max_results']))
              $sql .= '
                  LIMIT ' . ($search_data['max_results'] - $search_data['indexed_results']);

          $ignoreRequest = $smcFunc['db_search_query']('insert_into_log_messages_fulltext', $sql, $query_params);

          return $ignoreRequest;
      }

      /**
       * After a post is made, we update the database.
       *
       * Inserts one (id_word, id_msg) row into {db_prefix}log_search_words for every
       * word text2words() returns for the message body.
       *
       * @param array $msgOptions Uses 'body' and 'id'
       * @param array $topicOptions Not used
       * @param array $posterOptions Not used
       */
      public function postCreated($msgOptions, $topicOptions, $posterOptions)
      {
          global $modSettings, $smcFunc;

          $customIndexSettings = unserialize($modSettings['search_custom_index_config']);

          $inserts = array();
          foreach (text2words($msgOptions['body'], $customIndexSettings['bytes_per_word'], true) as $word)
              $inserts[] = array($word, $msgOptions['id']);

          if (!empty($inserts))
              $smcFunc['db_insert']('ignore',
                  '{db_prefix}log_search_words',
                  array('id_word' => 'int', 'id_msg' => 'int'),
                  $inserts,
                  array('id_word', 'id_msg')
              );
      }

      /**
       * After a post is modified, we update the database.
       *
       * Compares the words of the old and the new body and removes/adds the
       * differences (minus the stop words) in {db_prefix}log_search_words.
       *
       * @param array $msgOptions Uses 'old_body', 'body' and 'id'
       * @param array $topicOptions Not used
       * @param array $posterOptions Not used
       */
      public function postModified($msgOptions, $topicOptions, $posterOptions)
      {
          global $modSettings, $smcFunc;

          $customIndexSettings = unserialize($modSettings['search_custom_index_config']);

          $stopwords = empty($modSettings['search_stopwords']) ? array() : explode(',', $modSettings['search_stopwords']);
          $old_index = text2words($msgOptions['old_body'], $customIndexSettings['bytes_per_word'], true);
          $new_index = text2words($msgOptions['body'], $customIndexSettings['bytes_per_word'], true);

          // Calculate the words to be added and removed from the index.
          $removed_words = array_diff(array_diff($old_index, $new_index), $stopwords);
          $inserted_words = array_diff(array_diff($new_index, $old_index), $stopwords);

          // Delete the removed words AND the added ones to avoid key constraints.
          if (!empty($removed_words))
          {
              $removed_words = array_merge($removed_words, $inserted_words);
              $smcFunc['db_query']('', '
                  DELETE FROM {db_prefix}log_search_words
                  WHERE id_msg = {int:id_msg}
                      AND id_word IN ({array_int:removed_words})',
                  array(
                      'removed_words' => $removed_words,
                      'id_msg' => $msgOptions['id'],
                  )
              );
          }

          // Add the new words to be indexed.
          if (!empty($inserted_words))
          {
              $inserts = array();
              foreach ($inserted_words as $word)
                  $inserts[] = array($word, $msgOptions['id']);

              // NOTE(review): id_word is sent as 'string' here but as 'int' in postCreated() - confirm which is intended.
              $smcFunc['db_insert']('insert',
                  '{db_prefix}log_search_words',
                  array('id_word' => 'string', 'id_msg' => 'int'),
                  $inserts,
                  array('id_word', 'id_msg')
              );
          }
      }

      /**
       * Builds the value bound to a LIKE or RLIKE comparison for one search term.
       *
       * @param string $term The word or phrase to match
       * @param bool $useLike True for a LIKE pattern, false for an RLIKE whole word pattern
       * @return string The pattern to bind as a query parameter
       */
      private function buildMatchPattern($term, $useLike)
      {
          // LIKE: escape the wildcard characters and match the term anywhere.
          if ($useLike)
              return '%' . strtr($term, array('_' => '\\_', '%' => '\\%')) . '%';

          // RLIKE: neutralise regex metacharacters by wrapping each in a bracket expression, then anchor on word boundaries.
          return '[[:<:]]' . addcslashes(preg_replace(array('/([\[\]$.+*?|{}()])/'), array('[$1]'), $term), '\\\'') . '[[:>:]]';
      }
  }