SearchAPI-Fulltext.php 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242
  1. <?php
  2. /**
  3. * Simple Machines Forum (SMF)
  4. *
  5. * @package SMF
  6. * @author Simple Machines http://www.simplemachines.org
  7. * @copyright 2011 Simple Machines
  8. * @license http://www.simplemachines.org/about/smf/license.php BSD
  9. *
  10. * @version 2.1 Alpha 1
  11. */
  // Abort immediately unless the SMF constant has been defined, i.e. refuse
  // to run when this file is requested on its own.
  if (!defined('SMF'))
  {
      die('Hacking attempt...');
  }
  /*
  int searchSort(string $a, string $b)
  - callback function for usort used to sort the fulltext results.
  - sorts in descending order of word length, with excluded words last:
    large words, small words, large words that are excluded from the
    search, small words that are excluded.
  */
  /**
   * Search API that answers queries through a MySQL FULLTEXT index on the
   * message body.
   *
   * The public entry points are searchSort(), prepareIndexes() and
   * indexedWordQuery(); supportsMethod() reports exactly those three.
   */
  class fulltext_search
  {
      // This is the last version of SMF that this was tested on, to protect against API changes.
      public $version_compatible = 'SMF 2.1 Alpha 1';

      // This won't work with versions of SMF less than this.
      public $min_smf_version = 'SMF 2.1 Alpha 1';

      // Is it supported?
      public $is_supported = true;

      // Can we do a boolean search - tested on construct.
      protected $canDoBooleanSearch = false;

      // What words are banned?
      protected $bannedWords = array();

      // What is the minimum word length?
      protected $min_word_length = 4;

      // What databases support the fulltext index?
      protected $supported_databases = array('mysql');

      /**
       * Checks the database type and, when it is supported, probes the server
       * for boolean-mode support, loads the banned words and reads the minimum
       * indexed word length.
       *
       * When $db_type is not listed in $supported_databases the object is
       * flagged as unsupported and every other property keeps its default.
       */
      public function __construct()
      {
          global $smcFunc, $db_connection, $modSettings, $db_type;

          // Is this database supported?
          if (!in_array($db_type, $this->supported_databases, true))
          {
              $this->is_supported = false;
              return;
          }

          // Some MySQL versions are superior to others :P.
          $this->canDoBooleanSearch = version_compare($smcFunc['db_server_info']($db_connection), '4.0.1', '>=');

          // The banned words are stored as a single comma separated setting.
          $this->bannedWords = empty($modSettings['search_banned_words']) ? array() : explode(',', $modSettings['search_banned_words']);
          $this->min_word_length = $this->_getMinWordLength();
      }

      /**
       * Check whether the method can be performed by this API.
       *
       * @param string $methodName Name of the search API method being asked about.
       * @param mixed $query_params Not used by this API.
       * @return bool True for searchSort, prepareIndexes and indexedWordQuery, false otherwise.
       */
      public function supportsMethod($methodName, $query_params = null)
      {
          // All other methods, too bad dunno you.
          return in_array($methodName, array('searchSort', 'prepareIndexes', 'indexedWordQuery'), true);
      }

      /**
       * What is the minimum word length full text supports?
       *
       * Asks the server for its ft_min_word_len variable and falls back to 4
       * when the query fails or does not return exactly one row.
       *
       * @return int The minimum word length.
       */
      protected function _getMinWordLength()
      {
          global $smcFunc;

          // Try to determine the minimum number of letters for a fulltext search.
          $request = $smcFunc['db_search_query']('max_fulltext_length', '
              SHOW VARIABLES
              LIKE {string:fulltext_minimum_word_length}',
              array(
                  'fulltext_minimum_word_length' => 'ft_min_word_len',
              )
          );

          // 4 is the MySQL default...
          $min_word_length = 4;

          if ($request !== false)
          {
              if ($smcFunc['db_num_rows']($request) == 1)
              {
                  // The row is (variable name, value); only the value matters.
                  list (, $value) = $smcFunc['db_fetch_row']($request);
                  $min_word_length = (int) $value;
              }

              // Release the result whether or not it held the expected row.
              $smcFunc['db_free_result']($request);
          }

          return $min_word_length;
      }

      /**
       * Comparison callback that orders words by descending length, pushing
       * every word found in the global $excludedWords list behind all others.
       *
       * @param string $a First word.
       * @param string $b Second word.
       * @return int -1 when $a sorts first, 1 when $b sorts first, 0 on a tie.
       */
      public function searchSort($a, $b)
      {
          global $excludedWords;

          // Tolerate the global not having been set up as a list yet.
          $excluded = is_array($excludedWords) ? $excludedWords : array();

          // An excluded word loses 1000 points so it always ranks below the included ones.
          $x = strlen($a) - (in_array($a, $excluded, true) ? 1000 : 0);
          $y = strlen($b) - (in_array($b, $excluded, true) ? 1000 : 0);

          return $x < $y ? 1 : ($x > $y ? -1 : 0);
      }

      /**
       * Do we have to do some work with the words we are searching for to prepare them?
       *
       * Files $word (or its subwords) under $wordsSearch['indexed_words'] and/or
       * $wordsSearch['words'], and records excluded index terms in $wordsExclude.
       *
       * @param string $word The word or phrase to prepare.
       * @param array $wordsSearch Receives 'indexed_words' and 'words' entries (by reference).
       * @param array $wordsExclude Receives the excluded index terms (by reference).
       * @param bool $isExcluded Whether the word is excluded from the search.
       */
      public function prepareIndexes($word, &$wordsSearch, &$wordsExclude, $isExcluded)
      {
          global $modSettings, $smcFunc;

          $subwords = text2words($word, null, false);

          // Without boolean mode a phrase also has to be matched as a regular word, unless the index is forced.
          if (!$this->canDoBooleanSearch && count($subwords) > 1 && empty($modSettings['search_force_index']))
              $wordsSearch['words'][] = $word;

          if ($this->canDoBooleanSearch)
          {
              // Boolean mode understands quoted phrases, so a phrase stays in one piece.
              $fulltextWord = count($subwords) === 1 ? $word : '"' . $word . '"';
              $wordsSearch['indexed_words'][] = $fulltextWord;
              if ($isExcluded)
                  $wordsExclude[] = $fulltextWord;
          }
          // Excluded phrases don't benefit from being split into subwords.
          elseif (count($subwords) > 1 && $isExcluded)
              return;
          else
          {
              $relyOnIndex = true;
              foreach ($subwords as $subword)
              {
                  $isBanned = in_array($subword, $this->bannedWords, true);

                  // Only words that are long enough and not banned go to the index.
                  if (($smcFunc['strlen']($subword) >= $this->min_word_length) && !$isBanned)
                  {
                      $wordsSearch['indexed_words'][] = $subword;
                      if ($isExcluded)
                          $wordsExclude[] = $subword;
                  }
                  // A too-short word that is not banned cannot be answered by the index.
                  elseif (!$isBanned)
                      $relyOnIndex = false;
              }

              // NOTE(review): canDoBooleanSearch is always false in this branch, so this
              // fallback never fires. The condition is kept as it was; confirm the intent
              // before changing it.
              if ($this->canDoBooleanSearch && !$relyOnIndex && empty($modSettings['search_force_index']))
                  $wordsSearch['words'][] = $word;
          }
      }

      /**
       * Search for indexed words.
       *
       * Builds the WHERE clauses for the regular words, the caller supplied
       * restrictions, the excluded subject phrases/words and the FULLTEXT match,
       * then runs the (INSERT IGNORE ...) SELECT through db_search_query.
       *
       * @param array $words Uses the 'words' and 'indexed_words' lists.
       * @param array $search_data Uses 'params', 'insert_into', 'max_results' and
       *              'indexed_results'; an optional truthy 'no_regexp' entry forces
       *              LIKE matching instead of RLIKE.
       * @return mixed Whatever db_search_query returns.
       */
      public function indexedWordQuery($words, $search_data)
      {
          global $modSettings, $smcFunc;

          $query_select = array(
              'id_msg' => 'm.id_msg',
          );
          $query_where = array();
          $query_params = $search_data['params'];

          // Plain LIKE is used unless whole word matching is on and regexps have not been ruled out.
          $useLike = empty($modSettings['search_match_words']) || !empty($search_data['no_regexp']);
          $forceIndex = !empty($modSettings['search_force_index']);

          $regularWords = empty($words['words']) ? array() : $words['words'];
          $indexedWords = empty($words['indexed_words']) ? array() : $words['indexed_words'];
          $excludedWords = empty($query_params['excluded_words']) ? array() : $query_params['excluded_words'];
          $excludedIndexWords = empty($query_params['excluded_index_words']) ? array() : $query_params['excluded_index_words'];

          if (!empty($query_params['id_search']))
              $query_select['id_search'] = '{int:id_search}';

          // Regular (non indexed) words are matched against the body directly.
          if (empty($modSettings['search_simple_fulltext']))
          {
              $count = 0;
              foreach ($regularWords as $regularWord)
              {
                  $negate = in_array($regularWord, $excludedWords, true) ? ' NOT' : '';
                  $query_where[] = 'm.body' . $negate . $this->_getMatchOperator($useLike) . '{string:complex_body_' . $count . '}';
                  $query_params['complex_body_' . $count++] = $this->_getMatchPattern($regularWord, $useLike);
              }
          }

          // Restrictions handed in by the caller.
          if (!empty($query_params['user_query']))
              $query_where[] = '{raw:user_query}';
          if (!empty($query_params['board_query']))
              $query_where[] = 'm.id_board {raw:board_query}';
          if (!empty($query_params['topic']))
              $query_where[] = 'm.id_topic = {int:topic}';
          if (!empty($query_params['min_msg_id']))
              $query_where[] = 'm.id_msg >= {int:min_msg_id}';
          if (!empty($query_params['max_msg_id']))
              $query_where[] = 'm.id_msg <= {int:max_msg_id}';

          // Excluded phrases and words must not show up in the subject either.
          if (!empty($query_params['excluded_phrases']) && !$forceIndex)
          {
              $count = 0;
              foreach ($query_params['excluded_phrases'] as $phrase)
              {
                  $query_where[] = 'subject NOT' . $this->_getMatchOperator($useLike) . '{string:exclude_subject_phrase_' . $count . '}';
                  $query_params['exclude_subject_phrase_' . $count++] = $this->_getMatchPattern($phrase, $useLike);
              }
          }

          if (!empty($query_params['excluded_subject_words']) && !$forceIndex)
          {
              $count = 0;
              foreach ($query_params['excluded_subject_words'] as $excludedWord)
              {
                  $query_where[] = 'subject NOT' . $this->_getMatchOperator($useLike) . '{string:exclude_subject_words_' . $count . '}';
                  $query_params['exclude_subject_words_' . $count++] = $this->_getMatchPattern($excludedWord, $useLike);
              }
          }

          // Now the fulltext part itself.
          if (!empty($modSettings['search_simple_fulltext']))
          {
              // Simple mode: one natural language match on every non excluded word.
              $query_where[] = 'MATCH (body) AGAINST ({string:body_match})';
              $query_params['body_match'] = implode(' ', array_diff($indexedWords, $excludedIndexWords));
          }
          elseif ($this->canDoBooleanSearch)
          {
              // Boolean mode: prefix each term with + (required) or - (excluded).
              $terms = array();
              foreach ($indexedWords as $fulltextWord)
                  $terms[] = (in_array($fulltextWord, $excludedIndexWords, true) ? '-' : '+') . $fulltextWord;

              $query_params['boolean_match'] = implode(' ', $terms);
              $query_where[] = 'MATCH (body) AGAINST ({string:boolean_match} IN BOOLEAN MODE)';
          }
          else
          {
              // No boolean mode: one MATCH per word, negated for the excluded ones.
              $count = 0;
              foreach ($indexedWords as $fulltextWord)
              {
                  $query_where[] = (in_array($fulltextWord, $excludedIndexWords, true) ? 'NOT ' : '') . 'MATCH (body) AGAINST ({string:fulltext_match_' . $count . '})';
                  $query_params['fulltext_match_' . $count++] = $fulltextWord;
              }
          }

          // Put the statement together; the INSERT IGNORE head is only added when the database supports it.
          $sql = '';
          if (!empty($smcFunc['db_support_ignore']))
              $sql .= '
              INSERT IGNORE INTO {db_prefix}' . $search_data['insert_into'] . '
                  (' . implode(', ', array_keys($query_select)) . ')';

          $sql .= '
              SELECT ' . implode(', ', $query_select) . '
              FROM {db_prefix}messages AS m
              WHERE ' . implode('
                  AND ', $query_where);

          // Only ask for as many rows as are still needed to reach max_results.
          if (!empty($search_data['max_results']))
              $sql .= '
              LIMIT ' . ($search_data['max_results'] - $search_data['indexed_results']);

          $ignoreRequest = $smcFunc['db_search_query']('insert_into_log_messages_fulltext', $sql, $query_params);

          return $ignoreRequest;
      }

      /**
       * Returns the SQL comparison operator, padded with spaces, for a text match.
       *
       * @param bool $useLike True for LIKE, false for RLIKE.
       * @return string ' LIKE ' or ' RLIKE '.
       */
      protected function _getMatchOperator($useLike)
      {
          return $useLike ? ' LIKE ' : ' RLIKE ';
      }

      /**
       * Builds the value compared against a column for a word or phrase.
       *
       * @param string $text The word or phrase to look for.
       * @param bool $useLike True to build a LIKE pattern, false to build a regular expression.
       * @return string The pattern.
       */
      protected function _getMatchPattern($text, $useLike)
      {
          // LIKE: match anywhere in the text, with the LIKE wildcards escaped.
          if ($useLike)
              return '%' . strtr($text, array('_' => '\\_', '%' => '\\%')) . '%';

          // RLIKE: wrap each regexp metacharacter in a bracket expression and anchor on word boundaries.
          return '[[:<:]]' . addcslashes(preg_replace(array('/([\[\]$.+*?|{}()])/'), array('[$1]'), $text), '\\\'') . '[[:>:]]';
      }
  }
  208. ?>