SearchService.php: fix regex, parsing, group word chunks, use class, expose weighting

This commit is contained in:
Zankaria 2025-04-18 16:40:34 +02:00
parent 7b456a67e4
commit 202ed4c108

View file

@ -2,7 +2,7 @@
namespace Vichan\Service; namespace Vichan\Service;
use Vichan\Data\Driver\LogDriver; use Vichan\Data\Driver\LogDriver;
use Vichan\Data\UserPostQueries; use Vichan\Data\{UserPostQueries, SearchFilters, SearchFiltersWeighted};
class SearchService { class SearchService {
@ -31,14 +31,6 @@ class SearchService {
private const MAX_LENGTH_SUBJECT = 100; // posts.sql private const MAX_LENGTH_SUBJECT = 100; // posts.sql
private const MAX_LENGTH_NAME = 35; // posts.sql private const MAX_LENGTH_NAME = 35; // posts.sql
private const SEARCH_FILTER_BODY = 'body';
private const SEARCH_FILTER_SUBJECT = 'subject';
private const SEARCH_FILTER_NAME = 'name';
private const SEARCH_FILTER_BOARD = 'board';
private const SEARCH_FILTER_FLAG = 'flag';
private const SEARCH_FILTER_ID = 'id';
private const SEARCH_FILTER_THREAD = 'thread';
private LogDriver $log; private LogDriver $log;
private UserPostQueries $user_queries; private UserPostQueries $user_queries;
private ?array $flag_map; private ?array $flag_map;
@ -70,16 +62,22 @@ class SearchService {
return null; return null;
} }
/**
 * Strips trailing asterisks and whitespace-like bytes from a string.
 *
 * The characters removed from the right end are: '*', space, newline, carriage return, tab,
 * vertical tab and the NUL byte.
 *
 * @param string $str The string to trim.
 * @return string The string without the trailing characters listed above.
 */
private static function trimEnd(string $str): string {
	$trailing_chars = "* \n\r\t\v\0";
	return \rtrim($str, $trailing_chars);
}
private function sanitizeAndTransform(string $str): array {
// Escape UserQueries's wildcards. // Escape UserQueries's wildcards.
$str = $this->user_queries->escapeSearchPosts($str); $str = $this->user_queries->escapeSearchPosts($str);
// Coalesce multiple wildcards. // Coalesce multiple wildcards.
$str = \preg_replace_callback('/(?<!\\\\)(?:\\\\\\\\)*\*+/', function($match) { $wildcard_count = 0;
$str = \preg_replace_callback('/(?:\\\\\\\\)*\\\\\*|(?:\\\\\\\\)*\*+/', function($match) use (&$wildcard_count) {
$wildcard_count++;
return UserPostQueries::SEARCH_POSTS_WILDCARD; return UserPostQueries::SEARCH_POSTS_WILDCARD;
}, $str); }, $str);
// Query is too broad. // Query is too broad.
if ($str === UserPostQueries::SEARCH_POSTS_WILDCARD) { if ($str === UserPostQueries::SEARCH_POSTS_WILDCARD) {
return null; return [ null, 0 ];
} }
// Unescape. // Unescape.
$str = \strtr($str, [ $str = \strtr($str, [
@ -88,10 +86,10 @@ class SearchService {
'\\"' => '"' '\\"' => '"'
]); ]);
return $str; return [ $str, $wildcard_count ];
} }
private static function weightByContent(string $str) { private static function weightByContent(string $str): float {
$w = 1; $w = 1;
// Count common and short words. // Count common and short words.
@ -102,35 +100,17 @@ class SearchService {
if (\in_array($word, self::COMMON_WORDS)) { if (\in_array($word, self::COMMON_WORDS)) {
$w += $short ? 16 : 6; $w += $short ? 16 : 6;
} elseif ($short) { } elseif ($short) {
$w += 8; $w += 6;
} }
} }
return $w; return $w;
} }
/**
 * Weights a search term by how many wildcards it contains.
 *
 * The weight is the percentage of wildcards over the byte length of the term, plus a flat cost of 2
 * for every wildcard.
 *
 * @param string $str The search term.
 * @param int $wildcards Number of wildcards contained in the term.
 * @return float The wildcard weight of the term.
 */
private static function weightByWildcards(string $str, int $wildcards): float {
	$length = \strlen($str);
	if ($length === 0) {
		// An empty term has no length to take a ratio of (dividing would throw DivisionByZeroError),
		// so only the flat per-wildcard cost applies.
		return (float)($wildcards * 2);
	}

	// Wildcards over the total length of the word.
	$perc = $wildcards / $length * 100;
	return $perc + $wildcards * 2;
}
private function matchFlag(string $query): array { private function matchFlag(string $query): array {
@ -147,94 +127,50 @@ class SearchService {
return $acc; return $acc;
} }
private function filterFilters(array &$filters): array { /**
$weight = 0; * Parses a raw search query.
*
if ($filters[self::SEARCH_FILTER_SUBJECT] !== null) { * @param string $raw_query Raw user query. Phrases are searched in the post bodies. The user can specify also
if (\strlen($filters[self::SEARCH_FILTER_SUBJECT]) > self::MAX_LENGTH_SUBJECT) { * additional filters in the <key>:<value> format.
$filters[self::SEARCH_FILTER_SUBJECT] = null; * Available filters:
} else { * - board: the board, value can be quoted
$str = $this->sanitizeAndTransform($filters[self::SEARCH_FILTER_SUBJECT]); * - subject: post subject, value can be quoted, supports wildcards
$str = \trim($str, "* \n\r\t\v\0"); * - name: post name, value can be quoted, supports wildcards
$weight += self::weightByWildcards($str, UserPostQueries::SEARCH_POSTS_WILDCARD); * - flag: post flag, value can be quoted, supports wildcards
$filters[self::SEARCH_FILTER_SUBJECT] = $str; * - id: post id, must be numeric
} * - thread: thread id, must be numeric
} * The remaining text is split into chunks and searched in the post body.
if ($filters[self::SEARCH_FILTER_NAME] !== null) { * @return SearchFilters
if (\strlen($filters[self::SEARCH_FILTER_NAME]) > self::MAX_LENGTH_NAME) { */
$filters[self::SEARCH_FILTER_NAME] = null; public function parse(string $raw_query): SearchFilters {
} else {
$str = $this->sanitizeAndTransform($filters[self::SEARCH_FILTER_NAME]);
$str = \trim($str, "* \n\r\t\v\0");
$weight += self::weightByWildcards($str, UserPostQueries::SEARCH_POSTS_WILDCARD);
$filters[self::SEARCH_FILTER_NAME] = $str;
}
}
if ($filters[self::SEARCH_FILTER_FLAG] !== null) {
$max_flag_length = \array_reduce($this->flag_map, function($current_max, $str) {
return \max($current_max, \strlen($str));
}, 0);
if ($this->flag_map === null || empty($this->flag_map) || \strlen($filters[self::SEARCH_FILTER_FLAG]) > $max_flag_length) {
$filters[self::SEARCH_FILTER_FLAG] = null;
} else {
$str = \trim($str, "* \n\r\t\v\0");
$weight += self::weightByWildcards($str, '*');
$filters[self::SEARCH_FILTER_FLAG] = $str;
}
}
if ($filters[self::SEARCH_FILTER_BODY] !== null) {
$acc = [];
foreach ($filters[self::SEARCH_FILTER_BODY] as $str) {
$str = \trim($str, "* \n\r\t\v\0");
$w = self::weightByContent($str) + self::weightByWildcards($str, UserPostQueries::SEARCH_POSTS_WILDCARD);
if ($w + $weight <= $this->max_weight) {
$weight += $w;
$acc[] = $str;
}
}
$filters[self::SEARCH_FILTER_BODY] = $acc;
}
return [ $filters, $weight ];
}
public function parse(string $raw_query): array {
$tres = self::truncateQuery($raw_query, $this->max_query_length); $tres = self::truncateQuery($raw_query, $this->max_query_length);
if ($tres === null) { if ($tres === null) {
throw new \RuntimeException('Could not truncate query'); throw new \RuntimeException('Could not truncate query');
} }
$pres = \preg_match_all( $pres = \preg_match_all(
'/ '/(?:
(?:
\b(board): \b(board):
(?: (?:
"([^"]*)" # [2] quoted (no wildcards) "([^"]+)" # [2] board: "quoted"
| |
(\S+) # [3] unquoted (no wildcards) ([^\s"]+) # [3] board: unquoted
)
) )
| |
(?:
\b(subject|name|flag): \b(subject|name|flag):
(?: (?:
"((?:\\\\|\\"|\\\*|[^"\\\\*])*)" # [5] quoted with wildcards "((?:\\\\\\\\|\\\\\"|\\\\\*|[^"\\\\])*)" # [5] quoted with wildcards
| |
((?:\\\\|\\\*|[^\s\\\\*])+)+ # [6] unquoted with wildcards ((?:\\\\\\\\|\\\\\*|[^\s\\\\])+) # [6] unquoted with wildcards
)
) )
| |
LogDriver $log) \b(id|thread):
(\d+) # [8] numeric only
| |
(?: "((?:\\\\\\\\|\\\\\"|\\\\\*|[^"\\\\])*)" # [9] quoted free text
"((?:\\\\|\\"|\\\*|[^"\\\\*])*)" # [9] quoted chunk
| |
((?:\\\\|\\\*|[^\s\\\\*])+)+ # [10] unquoted chunk ([^"\s]+(?:\s+(?!\b(?:board|subject|name|flag|id|thread):)[^"\s]+)*) # [10] unquoted free text block
) )/iux',
/iux',
$tres, $tres,
$matches, $matches,
\PREG_SET_ORDER \PREG_SET_ORDER
@ -243,43 +179,39 @@ class SearchService {
throw new \RuntimeException('Could not decode the query'); throw new \RuntimeException('Could not decode the query');
} }
$filters = [ $filters = new SearchFilters();
self::SEARCH_FILTER_BODY => [],
self::SEARCH_FILTER_SUBJECT => null,
self::SEARCH_FILTER_NAME => null,
self::SEARCH_FILTER_BOARD => null,
self::SEARCH_FILTER_FLAG => null,
self::SEARCH_FILTER_ID => null,
self::SEARCH_FILTER_THREAD => null
];
foreach ($matches as $m) { foreach ($matches as $m) {
if (!empty($m[1])) { if (!empty($m[1])) {
// board (no wildcards). // board (no wildcards).
$value = \trim($m[2] ?? $m[3], '/'); $value = \trim(!empty($m[2]) ? $m[2] : $m[3], '/');
$filters[self::SEARCH_FILTER_BOARD] = $value; $filters->board = $value;
} elseif (!empty($m[4])) { } elseif (!empty($m[4])) {
// subject, name, flag (with wildcards). // subject, name, flag (with wildcards).
$key = $m[4]; $key = \strtolower($m[4]);
$value = $m[5] ?? $m[6]; $value = !empty($m[5]) ? $m[5] : $m[6];
if ($key === 'name') { if ($key === 'name') {
$filters[self::SEARCH_FILTER_NAME] = $value; $filters->name = $value;
} elseif ($key === 'subject') { } elseif ($key === 'subject') {
$filters[self::SEARCH_FILTER_SUBJECT] = $value; $filters->subject = $value;
} else { } else {
$filters[self::SEARCH_FILTER_FLAG] = $value; $filters->flag = $value;
} }
} elseif (!empty($m[7])) { } elseif (!empty($m[7])) {
$key = $m[7]; $key = \strtolower($m[7]);
$value = (int)$m[8]; $value = (int)$m[8];
$filters[$key] = $value; if ($key === 'id') {
$filters->id = $value;
} else {
$filters->thread = $value;
}
} elseif (!empty($m[9]) || !empty($m[10])) { } elseif (!empty($m[9]) || !empty($m[10])) {
$value = $m[9] ?? $m[10]; $value = !empty($m[9]) ? $m[9] : $m[10];
$filters[self::SEARCH_FILTER_BODY] = $value; $filters->body[] = $value;
} }
} }
@ -299,15 +231,92 @@ class SearchService {
$this->post_limit = $post_limit; $this->post_limit = $post_limit;
} }
/**
 * Reduces the user provided filters and assigns them a total weight.
 *
 * - subject and name are dropped when they exceed their maximum length, when they sanitize down to a
 *   query that is too broad, or when nothing is left after trimming;
 * - flag is dropped when there is no flag map, or when it is longer than the longest known flag;
 * - body chunks are kept, in order, only while the accumulated weight stays within the maximum weight.
 *
 * @param SearchFilters $filters The filters to sanitize, reduce and weight. The reduced values are also
 *                               written back onto this object.
 * @return SearchFiltersWeighted The reduced filters together with their accumulated weight.
 */
public function reduceAndWeight(SearchFilters $filters): SearchFiltersWeighted {
	$weighted = new SearchFiltersWeighted();

	[ $subject, $subject_weight ] = $this->reduceTextFilter($filters->subject, self::MAX_LENGTH_SUBJECT);
	$filters->subject = $subject;
	$weighted->weight += $subject_weight;

	[ $name, $name_weight ] = $this->reduceTextFilter($filters->name, self::MAX_LENGTH_NAME);
	$filters->name = $name;
	$weighted->weight += $name_weight;

	if ($filters->flag !== null) {
		// Check the map before reducing over it: array_reduce() does not accept null.
		if ($this->flag_map === null || $this->flag_map === []) {
			$filters->flag = null;
		} else {
			$max_flag_length = \array_reduce($this->flag_map, static function($current_max, $str) {
				return \max($current_max, \strlen($str));
			}, 0);

			// Add 2 to account for possible wildcards on the ends.
			if (\strlen($filters->flag) > $max_flag_length + 2) {
				$filters->flag = null;
			} else {
				$flag = \trim($filters->flag);
				if ($flag === '') {
					$filters->flag = null;
				} else {
					// The flag is not passed through sanitizeAndTransform(), so its wildcards are counted
					// here as raw '*' characters.
					// NOTE(review): escaped asterisks are counted too — confirm against matchFlag().
					$weighted->weight += self::weightByWildcards($flag, \substr_count($flag, '*'));
					$filters->flag = $flag;
				}
			}
		}
	}

	if ($filters->body !== null) {
		$acc = [];
		foreach ($filters->body as $chunk) {
			[ $str, $wildcards ] = $this->sanitizeAndTransform(self::trimEnd($chunk));
			// Compare against '' instead of using empty(): the chunk "0" is a legitimate search term.
			if ($str === null || $str === '') {
				continue;
			}

			$w = self::weightByContent($str) + self::weightByWildcards($str, $wildcards);
			// Chunks that would push the total over the limit are skipped, lighter ones may still fit.
			if ($w + $weighted->weight <= $this->max_weight) {
				$weighted->weight += $w;
				$acc[] = $str;
			}
		}
		$filters->body = $acc;
	}

	// search() reads the filter values from the weighted object, so carry the reduced values over.
	// NOTE(review): assumes SearchFiltersWeighted exposes the same writable public properties as
	// SearchFilters — confirm against its declaration.
	$weighted->board = $filters->board;
	$weighted->subject = $filters->subject;
	$weighted->name = $filters->name;
	$weighted->flag = $filters->flag;
	$weighted->id = $filters->id;
	$weighted->thread = $filters->thread;
	$weighted->body = $filters->body;

	return $weighted;
}

/**
 * Sanitizes a single wildcard-capable text filter and computes its wildcard weight.
 *
 * @param ?string $value The raw filter value, or null if the filter was not given.
 * @param int $max_length Maximum accepted byte length of the raw value.
 * @return array Pair of [ ?string reduced value (null if the filter is dropped), float weight ].
 */
private function reduceTextFilter(?string $value, int $max_length): array {
	if ($value === null || \strlen($value) > $max_length) {
		return [ null, 0.0 ];
	}

	[ $str, $wildcards ] = $this->sanitizeAndTransform($value);
	if ($str === null) {
		// Query is too broad.
		return [ null, 0.0 ];
	}

	$str = self::trimEnd($str);
	if ($str === '') {
		// Nothing left to search for; weighting an empty string would divide by zero.
		return [ null, 0.0 ];
	}

	return [ $str, self::weightByWildcards($str, $wildcards) ];
}
/** /**
* Run a search on user posts with the given filters. * Run a search on user posts with the given filters.
* *
* @param array $filters An array of filters made by {@see self::parse()}. * @param SearchFiltersWeighted $filters An array of filters made by {@see self::parse()}.
* @param ?string $fallback_board Fallback board if there isn't a board filter. * @param ?string $fallback_board Fallback board if there isn't a board filter.
* @return array Data array straight from the PDO, with all the fields in posts.sql * @return array Data array straight from the PDO, with all the fields in posts.sql
*/ */
public function search(string $ip, string $raw_query, array $filters, ?string $fallback_board): array { public function search(string $ip, string $raw_query, SearchFiltersWeighted $filters, ?string $fallback_board): array {
$board = $filters[self::SEARCH_FILTER_BOARD] ?? $fallback_board; $board = $filters->board ?? $fallback_board;
if ($board === null) { if ($board === null) {
return []; return [];
} }
@ -317,26 +326,24 @@ class SearchService {
return []; return [];
} }
list($filters, $weight) = $this->filterFilters($filters); $weight_perc = ($filters->weight / $this->max_weight) * 100;
$weight_perc = ($weight / $this->max_weight) * 100;
if ($weight_perc > 85) { if ($weight_perc > 85) {
/// Over 85 of the weight. /// Over 85 of the weight.
$this->log->log(LogDriver::NOTICE, "$ip search: weight $weight_perc ($weight) query '$raw_query'"); $this->log->log(LogDriver::NOTICE, "$ip search: weight $weight_perc ({$filters->weight}) query '$raw_query'");
} else { } else {
$this->log->log(LogDriver::INFO, "$ip search: weight $weight_perc ($weight) query '$raw_query'"); $this->log->log(LogDriver::INFO, "$ip search: weight $weight_perc ({$filters->weight}) query '$raw_query'");
} }
$flags = $filters[self::SEARCH_FILTER_FLAG] !== null ? $this->matchFlag($filters[self::SEARCH_FILTER_FLAG]) : null; $flags = $filters->flag !== null ? $this->matchFlag($filters->flag) : null;
return $this->user_queries->searchPosts( return $this->user_queries->searchPosts(
$board, $board,
$filters[self::SEARCH_FILTER_SUBJECT], $filters->subject,
$filters[self::SEARCH_FILTER_NAME], $filters->name,
$flags, $flags,
$filters[self::SEARCH_FILTER_ID], $filters->id,
$filters[self::SEARCH_FILTER_THREAD], $filters->thread,
$filters[self::SEARCH_FILTER_BODY], $filters->body,
$this->post_limit $this->post_limit
); );
} }