SearchAPI-Fulltext.php 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291
  1. <?php
  2. /**
  3. * Simple Machines Forum (SMF)
  4. *
  5. * @package SMF
  6. * @author Simple Machines http://www.simplemachines.org
  7. * @copyright 2012 Simple Machines Forum contributors
  8. * @license http://www.simplemachines.org/about/smf/license.php BSD
  9. *
  10. * @version 2.1 Alpha 1
  11. */
  // This file only works inside the forum: bail out when it is requested
  // directly, i.e. when the SMF constant has not been defined.
  if (defined('SMF') === false)
  {
      die('Hacking attempt...');
  }
  /**
   * Fulltext API, used when an SQL fulltext index is used.
   *
   * Prepares search words and builds the query that collects matching message
   * ids by combining LIKE/RLIKE filters with MATCH ... AGAINST on the body column.
   */
  class fulltext_search
  {
      /**
       * This is the last version of SMF that this was tested on, to protect against API changes.
       * @var string
       */
      public $version_compatible = 'SMF 2.1 Alpha 1';

      /**
       * This won't work with versions of SMF less than this.
       * @var string
       */
      public $min_smf_version = 'SMF 2.1 Alpha 1';

      /**
       * Is it supported? Set to false by the constructor when $db_type is not
       * one of $supported_databases.
       * @var bool
       */
      public $is_supported = true;

      /**
       * What words are banned? Filled from $modSettings['search_banned_words'];
       * not referenced anywhere else in this class.
       * @var array
       */
      protected $bannedWords = array();

      /**
       * What is the minimum word length? Words shorter than this are handled
       * with LIKE/RLIKE filters by prepareIndexes().
       * @var int
       */
      protected $min_word_length = 4;

      /**
       * What databases support the fulltext index?
       * @var array
       */
      protected $supported_databases = array('mysql');

      /**
       * fulltext_search::__construct()
       *
       * Checks the database type and, when it is supported, loads the banned
       * words and the minimum fulltext word length.
       */
      public function __construct()
      {
          global $modSettings, $db_type;

          // Is this database supported?
          if (!in_array($db_type, $this->supported_databases))
          {
              $this->is_supported = false;
              return;
          }

          $this->bannedWords = empty($modSettings['search_banned_words']) ? array() : explode(',', $modSettings['search_banned_words']);
          $this->min_word_length = $this->_getMinWordLength();
      }

      /**
       * fulltext_search::supportsMethod()
       *
       * Check whether the method can be performed by this API.
       *
       * @param mixed $methodName Name of the API method being asked about
       * @param mixed $query_params Accepted for interface compatibility; not used here
       * @return bool True for searchSort, prepareIndexes and indexedWordQuery, false otherwise
       */
      public function supportsMethod($methodName, $query_params = null)
      {
          // All other methods, too bad dunno you.
          return in_array($methodName, array('searchSort', 'prepareIndexes', 'indexedWordQuery'), true);
      }

      /**
       * fulltext_search::_getMinWordLength()
       *
       * What is the minimum word length full text supports?
       *
       * @return int The server's ft_min_word_len value, or 4 when it cannot be read
       */
      protected function _getMinWordLength()
      {
          global $smcFunc;

          // 4 is the MySQL default...
          $min_word_length = 4;

          // Try to determine the minimum number of letters for a fulltext search.
          $request = $smcFunc['db_search_query']('max_fulltext_length', '
              SHOW VARIABLES
              LIKE {string:fulltext_minimum_word_length}',
              array(
                  'fulltext_minimum_word_length' => 'ft_min_word_len',
              )
          );
          if ($request === false)
              return $min_word_length;

          if ($smcFunc['db_num_rows']($request) == 1)
          {
              list (, $value) = $smcFunc['db_fetch_row']($request);
              $min_word_length = (int) $value;
          }

          // Free the result for every successful query, not only when exactly one row came back.
          $smcFunc['db_free_result']($request);

          return $min_word_length;
      }

      /**
       * callback function for usort used to sort the fulltext results.
       * the order of sorting is: large words, small words, large words that
       * are excluded from the search, small words that are excluded.
       *
       * @param string $a Word A
       * @param string $b Word B
       * @return int -1, 0 or 1 as expected by usort
       */
      public function searchSort($a, $b)
      {
          global $excludedWords, $smcFunc;

          // The excluded list is a global owned by the caller; tolerate it being unset.
          $excluded = is_array($excludedWords) ? $excludedWords : array();

          // Excluded words get a large penalty so they always sort after included ones.
          $x = $smcFunc['strlen']($a) - (in_array($a, $excluded) ? 1000 : 0);
          $y = $smcFunc['strlen']($b) - (in_array($b, $excluded) ? 1000 : 0);

          if ($x == $y)
              return 0;

          return $x < $y ? 1 : -1;
      }

      /**
       * fulltext_search::prepareIndexes()
       *
       * Do we have to do some work with the words we are searching for to prepare them?
       *
       * Every word is added to $wordsSearch['indexed_words'] (quoted unless it is a
       * single subword). Unless $modSettings['search_force_index'] is set, words
       * the fulltext index would ignore are also added to $wordsSearch['words']
       * and $wordsSearch['complex_words'] so they can be matched with LIKE/RLIKE.
       *
       * @param mixed $word The search term to prepare
       * @param mixed $wordsSearch Search word lists, modified by reference
       * @param mixed $wordsExclude List of excluded fulltext terms, modified by reference
       * @param mixed $isExcluded Whether $word is excluded from the search
       * @return void
       */
      public function prepareIndexes($word, &$wordsSearch, &$wordsExclude, $isExcluded)
      {
          global $modSettings, $smcFunc;

          $subwords = text2words($word, null, false);

          // Terms made of several subwords are searched as a quoted phrase.
          $fulltextWord = count($subwords) === 1 ? $word : '"' . $word . '"';

          if (empty($modSettings['search_force_index']))
          {
              // A boolean capable search engine and not forced to only use an index, we may use a non indexed search
              // this is harder on the server so we are restrictive here
              if (count($subwords) > 1 && preg_match('~[.:@$]~', $word))
              {
                  // using special characters that a full index would ignore and the remaining words are short which would also be ignored
                  $needsComplexSearch = $smcFunc['strlen'](reset($subwords)) < $this->min_word_length
                      && $smcFunc['strlen'](next($subwords)) < $this->min_word_length;
              }
              else
              {
                  // short words have feelings too
                  $needsComplexSearch = $smcFunc['strlen'](trim($word, "/*- ")) < $this->min_word_length;
              }

              if ($needsComplexSearch)
              {
                  $wordsSearch['words'][] = trim($word, "/*- ");
                  $wordsSearch['complex_words'][] = $fulltextWord;
              }
          }

          $wordsSearch['indexed_words'][] = $fulltextWord;
          if ($isExcluded)
              $wordsExclude[] = $fulltextWord;
      }

      /**
       * fulltext_search::_buildTextMatch()
       *
       * Builds the SQL operator and the bound value used to match a piece of text
       * either as a substring (LIKE) or as a whole word (RLIKE).
       *
       * @param string $text The word or phrase to match
       * @param bool $use_regexp True for a word-boundary RLIKE match, false for LIKE
       * @return array array(operator surrounded by spaces, value to bind)
       */
      protected function _buildTextMatch($text, $use_regexp)
      {
          // Plain substring match: escape the LIKE wildcards found in the text.
          if (!$use_regexp)
              return array(' LIKE ', '%' . strtr($text, array('_' => '\\_', '%' => '\\%')) . '%');

          // NOTE(review): [[:<:]] and [[:>:]] are the word-boundary markers of MySQL's
          // older regex library; newer MySQL versions may not accept them - confirm
          // against the target server.
          return array(' RLIKE ', '[[:<:]]' . addcslashes(preg_replace(array('/([\[\]$.+*?|{}()])/'), array('[$1]'), $text), '\\\'') . '[[:>:]]');
      }

      /**
       * fulltext_search::indexedWordQuery()
       *
       * Search for indexed words.
       *
       * Builds and runs a query selecting message ids from {db_prefix}messages
       * (as an INSERT IGNORE into $search_data['insert_into'] when the database
       * supports it), filtered by the non-indexed words, the user/board/topic/
       * message-id restrictions, the subject exclusions and the fulltext match.
       *
       * @param mixed $words Word lists: 'words', 'indexed_words' and 'complex_words'
       * @param mixed $search_data Search context: 'params', 'no_regexp', 'insert_into', 'max_results', 'indexed_results'
       * @return mixed Whatever $smcFunc['db_search_query'] returns
       */
      public function indexedWordQuery($words, $search_data)
      {
          global $modSettings, $smcFunc;

          $query_select = array(
              'id_msg' => 'm.id_msg',
          );
          $query_where = array();
          $query_params = $search_data['params'];

          // Optional inputs default to empty so a missing key cannot raise errors.
          $regular_words = empty($words['words']) ? array() : $words['words'];
          $indexed_words = empty($words['indexed_words']) ? array() : $words['indexed_words'];
          $complex_words = empty($words['complex_words']) ? array() : $words['complex_words'];
          $excluded_words = empty($query_params['excluded_words']) ? array() : $query_params['excluded_words'];
          $excluded_index_words = empty($query_params['excluded_index_words']) ? array() : $query_params['excluded_index_words'];

          // Whole-word (regex) matching only when enabled and not vetoed by the caller.
          $use_regexp = !empty($modSettings['search_match_words']) && empty($search_data['no_regexp']);

          if (!empty($query_params['id_search']))
              $query_select['id_search'] = '{int:id_search}';

          // Words the fulltext index cannot handle are matched directly against the body.
          if (empty($modSettings['search_simple_fulltext']))
          {
              $count = 0;
              foreach ($regular_words as $regularWord)
              {
                  list ($operator, $pattern) = $this->_buildTextMatch($regularWord, $use_regexp);
                  $query_where[] = 'm.body' . (in_array($regularWord, $excluded_words) ? ' NOT' : '') . $operator . '{string:complex_body_' . $count . '}';
                  $query_params['complex_body_' . $count++] = $pattern;
              }
          }

          if (!empty($query_params['user_query']))
              $query_where[] = '{raw:user_query}';
          if (!empty($query_params['board_query']))
              $query_where[] = 'm.id_board {raw:board_query}';
          if (!empty($query_params['topic']))
              $query_where[] = 'm.id_topic = {int:topic}';
          if (!empty($query_params['min_msg_id']))
              $query_where[] = 'm.id_msg >= {int:min_msg_id}';
          if (!empty($query_params['max_msg_id']))
              $query_where[] = 'm.id_msg <= {int:max_msg_id}';

          // Subject exclusions are non-indexed filters, so skip them when the index is forced.
          if (empty($modSettings['search_force_index']))
          {
              $subject_exclusions = array(
                  'exclude_subject_phrase_' => 'excluded_phrases',
                  'exclude_subject_words_' => 'excluded_subject_words',
              );
              foreach ($subject_exclusions as $placeholder => $param_key)
              {
                  if (empty($query_params[$param_key]))
                      continue;

                  $count = 0;
                  foreach ($query_params[$param_key] as $text)
                  {
                      list ($operator, $pattern) = $this->_buildTextMatch($text, $use_regexp);
                      $query_where[] = 'subject NOT' . $operator . '{string:' . $placeholder . $count . '}';
                      $query_params[$placeholder . $count++] = $pattern;
                  }
              }
          }

          if (!empty($modSettings['search_simple_fulltext']))
          {
              $query_where[] = 'MATCH (body) AGAINST ({string:body_match})';
              $query_params['body_match'] = implode(' ', array_diff($indexed_words, $excluded_index_words));
          }
          else
          {
              // remove any indexed words that are used in the complex body search terms
              $boolean_terms = array();
              foreach (array_diff($indexed_words, $complex_words) as $fulltextWord)
                  $boolean_terms[] = (in_array($fulltextWord, $excluded_index_words) ? '-' : '+') . $fulltextWord;
              $query_params['boolean_match'] = implode(' ', $boolean_terms);

              // if we have bool terms to search, add them in
              if ($query_params['boolean_match'] !== '')
                  $query_where[] = 'MATCH (body) AGAINST ({string:boolean_match} IN BOOLEAN MODE)';
          }

          // NOTE(review): if no condition was collected above, the WHERE clause is
          // empty and the query is invalid - confirm callers always supply terms.
          $ignoreRequest = $smcFunc['db_search_query']('insert_into_log_messages_fulltext', ($smcFunc['db_support_ignore'] ? ( '
              INSERT IGNORE INTO {db_prefix}' . $search_data['insert_into'] . '
                  (' . implode(', ', array_keys($query_select)) . ')') : '') . '
              SELECT ' . implode(', ', $query_select) . '
              FROM {db_prefix}messages AS m
              WHERE ' . implode('
                  AND ', $query_where) . (empty($search_data['max_results']) ? '' : '
              LIMIT ' . ($search_data['max_results'] - $search_data['indexed_results'])),
              $query_params
          );

          return $ignoreRequest;
      }
  }
  254. ?>