SearchService.php: fix regex, parsing, group word chunks, use class, expose weighting
This commit is contained in:
parent
7b456a67e4
commit
202ed4c108
1 changed files with 151 additions and 144 deletions
|
@ -2,7 +2,7 @@
|
|||
namespace Vichan\Service;
|
||||
|
||||
use Vichan\Data\Driver\LogDriver;
|
||||
use Vichan\Data\UserPostQueries;
|
||||
use Vichan\Data\{UserPostQueries, SearchFilters, SearchFiltersWeighted};
|
||||
|
||||
|
||||
class SearchService {
|
||||
|
@ -31,14 +31,6 @@ class SearchService {
|
|||
private const MAX_LENGTH_SUBJECT = 100; // posts.sql
|
||||
private const MAX_LENGTH_NAME = 35; // posts.sql
|
||||
|
||||
private const SEARCH_FILTER_BODY = 'body';
|
||||
private const SEARCH_FILTER_SUBJECT = 'subject';
|
||||
private const SEARCH_FILTER_NAME = 'name';
|
||||
private const SEARCH_FILTER_BOARD = 'board';
|
||||
private const SEARCH_FILTER_FLAG = 'flag';
|
||||
private const SEARCH_FILTER_ID = 'id';
|
||||
private const SEARCH_FILTER_THREAD = 'thread';
|
||||
|
||||
private LogDriver $log;
|
||||
private UserPostQueries $user_queries;
|
||||
private ?array $flag_map;
|
||||
|
@ -70,16 +62,22 @@ class SearchService {
|
|||
return null;
|
||||
}
|
||||
|
||||
private function sanitizeAndTransform(string $str): ?string {
|
||||
/**
 * Strips trailing asterisks and whitespace-like bytes from a search term.
 *
 * Only the right-hand side is touched; leading characters are kept as they are.
 *
 * @param string $str The term to clean up.
 * @return string The term without trailing '*', space, newline, carriage return, tab, vertical tab or NUL.
 */
private static function trimEnd(string $str): string {
	// Characters considered noise at the end of a term.
	$trailing_noise = "* \n\r\t\v\0";

	return \rtrim($str, $trailing_noise);
}
|
||||
|
||||
private function sanitizeAndTransform(string $str): array {
|
||||
// Escape UserQueries's wildcards.
|
||||
$str = $this->user_queries->escapeSearchPosts($str);
|
||||
// Coalesce multiple wildcards.
|
||||
$str = \preg_replace_callback('/(?<!\\\\)(?:\\\\\\\\)*\*+/', function($match) {
|
||||
$wildcard_count = 0;
|
||||
$str = \preg_replace_callback('/(?:\\\\\\\\)*\\\\\*|(?:\\\\\\\\)*\*+/', function($match) use (&$wildcard_count) {
|
||||
$wildcard_count++;
|
||||
return UserPostQueries::SEARCH_POSTS_WILDCARD;
|
||||
}, $str);
|
||||
// Query is too broad.
|
||||
if ($str === UserPostQueries::SEARCH_POSTS_WILDCARD) {
|
||||
return null;
|
||||
return [ null, 0 ];
|
||||
}
|
||||
// Unescape.
|
||||
$str = \strtr($str, [
|
||||
|
@ -88,10 +86,10 @@ class SearchService {
|
|||
'\\"' => '"'
|
||||
]);
|
||||
|
||||
return $str;
|
||||
return [ $str, $wildcard_count ];
|
||||
}
|
||||
|
||||
private static function weightByContent(string $str) {
|
||||
private static function weightByContent(string $str): float {
|
||||
$w = 1;
|
||||
|
||||
// Count common and short words.
|
||||
|
@ -102,35 +100,17 @@ class SearchService {
|
|||
if (\in_array($word, self::COMMON_WORDS)) {
|
||||
$w += $short ? 16 : 6;
|
||||
} elseif ($short) {
|
||||
$w += 8;
|
||||
$w += 6;
|
||||
}
|
||||
}
|
||||
|
||||
return $w;
|
||||
}
|
||||
|
||||
private static function weightByWildcards(string $str, string $wildcard): float {
|
||||
$no_end = \rtrim($str, "$wildcard \n\r\t\v\0");
|
||||
|
||||
// Add just 1 if the wildcard is at the end.
|
||||
$w = (\strlen($str) !== \strlen($no_end)) ? 1 : 0;
|
||||
|
||||
// Count only unescaped wildcards.
|
||||
$esc_wildcard = \preg_quote($wildcard);
|
||||
$non_end_wildcards = \preg_match_all("/$esc_wildcard(?!$esc_wildcard)/", $no_end);
|
||||
if ($non_end_wildcards === 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
$w += $non_end_wildcards * 2;
|
||||
|
||||
// Non-end wildcards over the total length of the word.
|
||||
$perc = $non_end_wildcards / \strlen($str) * 100;
|
||||
if ($perc > 40) {
|
||||
return $w + $perc * 2;
|
||||
} else {
|
||||
return $w + $perc;
|
||||
}
|
||||
/**
 * Weighs a search term by the amount of wildcards it contains.
 *
 * The weight is the wildcard density (wildcards over the byte length of the term, as a percentage)
 * plus a flat cost of 2 for every wildcard.
 *
 * @param string $str The search term. Only its byte length is used.
 * @param int $wildcards Number of wildcards contained in the term.
 * @return float The weight. For an empty term only the flat per-wildcard cost is returned.
 */
private static function weightByWildcards(string $str, int $wildcards): float {
	$flat_cost = $wildcards * 2;

	$length = \strlen($str);
	if ($length === 0) {
		// There is no length to compute a density over, and dividing by zero throws
		// a DivisionByZeroError on PHP 8.
		return (float)$flat_cost;
	}

	// Wildcards over the total length of the word.
	$perc = $wildcards / $length * 100;
	return $perc + $flat_cost;
}
|
||||
|
||||
private function matchFlag(string $query): array {
|
||||
|
@ -147,94 +127,50 @@ class SearchService {
|
|||
return $acc;
|
||||
}
|
||||
|
||||
private function filterFilters(array &$filters): array {
|
||||
$weight = 0;
|
||||
|
||||
if ($filters[self::SEARCH_FILTER_SUBJECT] !== null) {
|
||||
if (\strlen($filters[self::SEARCH_FILTER_SUBJECT]) > self::MAX_LENGTH_SUBJECT) {
|
||||
$filters[self::SEARCH_FILTER_SUBJECT] = null;
|
||||
} else {
|
||||
$str = $this->sanitizeAndTransform($filters[self::SEARCH_FILTER_SUBJECT]);
|
||||
$str = \trim($str, "* \n\r\t\v\0");
|
||||
$weight += self::weightByWildcards($str, UserPostQueries::SEARCH_POSTS_WILDCARD);
|
||||
$filters[self::SEARCH_FILTER_SUBJECT] = $str;
|
||||
}
|
||||
}
|
||||
if ($filters[self::SEARCH_FILTER_NAME] !== null) {
|
||||
if (\strlen($filters[self::SEARCH_FILTER_NAME]) > self::MAX_LENGTH_NAME) {
|
||||
$filters[self::SEARCH_FILTER_NAME] = null;
|
||||
} else {
|
||||
$str = $this->sanitizeAndTransform($filters[self::SEARCH_FILTER_NAME]);
|
||||
$str = \trim($str, "* \n\r\t\v\0");
|
||||
$weight += self::weightByWildcards($str, UserPostQueries::SEARCH_POSTS_WILDCARD);
|
||||
$filters[self::SEARCH_FILTER_NAME] = $str;
|
||||
}
|
||||
}
|
||||
if ($filters[self::SEARCH_FILTER_FLAG] !== null) {
|
||||
$max_flag_length = \array_reduce($this->flag_map, function($current_max, $str) {
|
||||
return \max($current_max, \strlen($str));
|
||||
}, 0);
|
||||
|
||||
if ($this->flag_map === null || empty($this->flag_map) || \strlen($filters[self::SEARCH_FILTER_FLAG]) > $max_flag_length) {
|
||||
$filters[self::SEARCH_FILTER_FLAG] = null;
|
||||
} else {
|
||||
$str = \trim($str, "* \n\r\t\v\0");
|
||||
$weight += self::weightByWildcards($str, '*');
|
||||
$filters[self::SEARCH_FILTER_FLAG] = $str;
|
||||
}
|
||||
}
|
||||
|
||||
if ($filters[self::SEARCH_FILTER_BODY] !== null) {
|
||||
$acc = [];
|
||||
foreach ($filters[self::SEARCH_FILTER_BODY] as $str) {
|
||||
$str = \trim($str, "* \n\r\t\v\0");
|
||||
$w = self::weightByContent($str) + self::weightByWildcards($str, UserPostQueries::SEARCH_POSTS_WILDCARD);
|
||||
if ($w + $weight <= $this->max_weight) {
|
||||
$weight += $w;
|
||||
$acc[] = $str;
|
||||
}
|
||||
}
|
||||
|
||||
$filters[self::SEARCH_FILTER_BODY] = $acc;
|
||||
}
|
||||
|
||||
return [ $filters, $weight ];
|
||||
}
|
||||
|
||||
public function parse(string $raw_query): array {
|
||||
/**
|
||||
* Parses a raw search query.
|
||||
*
|
||||
* @param string $raw_query Raw user query. Phrases are searched in the post bodies. The user can also specify
|
||||
* additional filters in the <key>:<value> format.
|
||||
* Available filters:
|
||||
* - board: the board, value can be quoted
|
||||
* - subject: post subject, value can be quoted, supports wildcards
|
||||
* - name: post name, value can be quoted, supports wildcards
|
||||
* - flag: post flag, value can be quoted, supports wildcards
|
||||
* - id: post id, must be numeric
|
||||
* - thread: thread id, must be numeric
|
||||
* The remaining text is split into chunks and searched in the post body.
|
||||
* @return SearchFilters
|
||||
*/
|
||||
public function parse(string $raw_query): SearchFilters {
|
||||
$tres = self::truncateQuery($raw_query, $this->max_query_length);
|
||||
if ($tres === null) {
|
||||
throw new \RuntimeException('Could not truncate query');
|
||||
}
|
||||
|
||||
$pres = \preg_match_all(
|
||||
'/
|
||||
(?:
|
||||
\b(board):
|
||||
'/(?:
|
||||
\b(board):
|
||||
(?:
|
||||
"([^"]*)" # [2] quoted (no wildcards)
|
||||
"([^"]+)" # [2] board: "quoted"
|
||||
|
|
||||
(\S+) # [3] unquoted (no wildcards)
|
||||
([^\s"]+) # [3] board: unquoted
|
||||
)
|
||||
)
|
||||
|
|
||||
(?:
|
||||
\b(subject|name|flag):
|
||||
\b(subject|name|flag):
|
||||
(?:
|
||||
"((?:\\\\|\\"|\\\*|[^"\\\\*])*)" # [5] quoted with wildcards
|
||||
"((?:\\\\\\\\|\\\\\"|\\\\\*|[^"\\\\])*)" # [5] quoted with wildcards
|
||||
|
|
||||
((?:\\\\|\\\*|[^\s\\\\*])+)+ # [6] unquoted with wildcards
|
||||
((?:\\\\\\\\|\\\\\*|[^\s\\\\])+) # [6] unquoted with wildcards
|
||||
)
|
||||
)
|
||||
|
|
||||
LogDriver $log)
|
||||
\b(id|thread):
|
||||
(\d+) # [8] numeric only
|
||||
|
|
||||
(?:
|
||||
"((?:\\\\|\\"|\\\*|[^"\\\\*])*)" # [9] quoted chunk
|
||||
|
|
||||
((?:\\\\|\\\*|[^\s\\\\*])+)+ # [10] unquoted chunk
|
||||
)
|
||||
/iux',
|
||||
"((?:\\\\\\\\|\\\\\"|\\\\\*|[^"\\\\])*)" # [9] quoted free text
|
||||
|
|
||||
([^"\s]+(?:\s+(?!\b(?:board|subject|name|flag|id|thread):)[^"\s]+)*) # [10] unquoted free text block
|
||||
)/iux',
|
||||
$tres,
|
||||
$matches,
|
||||
\PREG_SET_ORDER
|
||||
|
@ -243,43 +179,39 @@ class SearchService {
|
|||
throw new \RuntimeException('Could not decode the query');
|
||||
}
|
||||
|
||||
$filters = [
|
||||
self::SEARCH_FILTER_BODY => [],
|
||||
self::SEARCH_FILTER_SUBJECT => null,
|
||||
self::SEARCH_FILTER_NAME => null,
|
||||
self::SEARCH_FILTER_BOARD => null,
|
||||
self::SEARCH_FILTER_FLAG => null,
|
||||
self::SEARCH_FILTER_ID => null,
|
||||
self::SEARCH_FILTER_THREAD => null
|
||||
];
|
||||
$filters = new SearchFilters();
|
||||
|
||||
foreach ($matches as $m) {
|
||||
if (!empty($m[1])) {
|
||||
// board (no wildcards).
|
||||
$value = \trim($m[2] ?? $m[3], '/');
|
||||
$value = \trim(!empty($m[2]) ? $m[2] : $m[3], '/');
|
||||
|
||||
$filters[self::SEARCH_FILTER_BOARD] = $value;
|
||||
$filters->board = $value;
|
||||
} elseif (!empty($m[4])) {
|
||||
// subject, name, flag (with wildcards).
|
||||
$key = $m[4];
|
||||
$value = $m[5] ?? $m[6];
|
||||
$key = \strtolower($m[4]);
|
||||
$value = !empty($m[5]) ? $m[5] : $m[6];
|
||||
|
||||
if ($key === 'name') {
|
||||
$filters[self::SEARCH_FILTER_NAME] = $value;
|
||||
$filters->name = $value;
|
||||
} elseif ($key === 'subject') {
|
||||
$filters[self::SEARCH_FILTER_SUBJECT] = $value;
|
||||
$filters->subject = $value;
|
||||
} else {
|
||||
$filters[self::SEARCH_FILTER_FLAG] = $value;
|
||||
$filters->flag = $value;
|
||||
}
|
||||
} elseif (!empty($m[7])) {
|
||||
$key = $m[7];
|
||||
$key = \strtolower($m[7]);
|
||||
$value = (int)$m[8];
|
||||
|
||||
$filters[$key] = $value;
|
||||
if ($key === 'id') {
|
||||
$filters->id = $value;
|
||||
} else {
|
||||
$filters->thread = $value;
|
||||
}
|
||||
} elseif (!empty($m[9]) || !empty($m[10])) {
|
||||
$value = $m[9] ?? $m[10];
|
||||
$value = !empty($m[9]) ? $m[9] : $m[10];
|
||||
|
||||
$filters[self::SEARCH_FILTER_BODY] = $value;
|
||||
$filters->body[] = $value;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -299,15 +231,92 @@ class SearchService {
|
|||
$this->post_limit = $post_limit;
|
||||
}
|
||||
|
||||
/**
 * Reduces the user provided filters and assigns them a total weight.
 *
 * - subject and name are dropped if longer than their column, otherwise they are passed through
 *   {@see self::sanitizeAndTransform()}, right-trimmed and weighted by their wildcards.
 * - flag is dropped if there is no flag map or if it is longer than the longest known flag
 *   (plus 2 for wildcards on the ends); it keeps its '*' wildcards.
 * - body chunks are sanitized and added one by one as long as the total weight stays within
 *   the configured maximum; chunks that would exceed it are skipped.
 *
 * The reduced values are written back to $filters and also copied to the returned object.
 *
 * @param SearchFilters $filters The filters to sanitize, reduce and weight. Modified in place.
 * @return SearchFiltersWeighted The reduced filters together with their total weight.
 */
public function reduceAndWeight(SearchFilters $filters): SearchFiltersWeighted {
	$weighted = new SearchFiltersWeighted();

	list($filters->subject, $w) = $this->reducePatternFilter($filters->subject, self::MAX_LENGTH_SUBJECT);
	$weighted->weight += $w;

	list($filters->name, $w) = $this->reducePatternFilter($filters->name, self::MAX_LENGTH_NAME);
	$weighted->weight += $w;

	if ($filters->flag !== null) {
		// The flag value is used as-is (it is not a SQL pattern), so derive the term and its
		// wildcard count from the flag itself.
		$flag = \trim($filters->flag);

		// Check the map before reducing it: array_reduce() rejects null.
		if ($this->flag_map === null || empty($this->flag_map) || $flag === '') {
			$filters->flag = null;
		} else {
			$max_flag_length = \array_reduce($this->flag_map, function($current_max, $str) {
				return \max($current_max, \strlen($str));
			}, 0);

			// Add 2 to account for possible wildcards on the ends.
			if (\strlen($flag) > $max_flag_length + 2) {
				$filters->flag = null;
			} else {
				$weighted->weight += self::weightByWildcards($flag, \substr_count($flag, '*'));
				$filters->flag = $flag;
			}
		}
	}

	if ($filters->body !== null) {
		$acc = [];
		foreach ($filters->body as $chunk) {
			list($str, $wildcards) = $this->sanitizeAndTransform(self::trimEnd($chunk));

			// Compare against '' explicitly: empty() would also discard the valid term "0".
			if ($str === null || $str === '') {
				continue;
			}

			$w = self::weightByContent($str) + self::weightByWildcards($str, $wildcards);
			if ($w + $weighted->weight <= $this->max_weight) {
				$weighted->weight += $w;
				$acc[] = $str;
			}
		}

		$filters->body = $acc;
	}

	// Hand the reduced filters over to the result, since search() reads them from there.
	// NOTE(review): assumes SearchFiltersWeighted exposes the same writable properties as
	// SearchFilters, as its use in search() suggests — confirm against the class definition.
	$weighted->board = $filters->board;
	$weighted->subject = $filters->subject;
	$weighted->name = $filters->name;
	$weighted->flag = $filters->flag;
	$weighted->id = $filters->id;
	$weighted->thread = $filters->thread;
	$weighted->body = $filters->body;

	return $weighted;
}

/**
 * Reduces a single wildcard-capable text filter (subject or name) and computes its weight.
 *
 * @param ?string $value The raw filter value, or null if the filter is not set.
 * @param int $max_length Maximum accepted byte length of the raw value.
 * @return array A [ ?string, float ] pair: the reduced value (null if the filter was dropped) and its weight.
 */
private function reducePatternFilter(?string $value, int $max_length): array {
	if ($value === null || \strlen($value) > $max_length) {
		return [ null, 0.0 ];
	}

	list($str, $wildcards) = $this->sanitizeAndTransform($value);
	if ($str === null) {
		// Query is too broad.
		return [ null, 0.0 ];
	}

	$str = self::trimEnd($str);
	if ($str === '') {
		// Nothing left to match on; also keeps an empty term away from the weighting.
		return [ null, 0.0 ];
	}

	return [ $str, self::weightByWildcards($str, $wildcards) ];
}
|
||||
|
||||
/**
|
||||
* Run a search on user posts with the given filters.
|
||||
*
|
||||
* @param array $filters An array of filters made by {@see self::parse()}.
|
||||
* @param SearchFiltersWeighted $filters The weighted filters made by {@see self::reduceAndWeight()}.
|
||||
* @param ?string $fallback_board Fallback board if there isn't a board filter.
|
||||
* @return array Data array straight from the PDO, with all the fields in posts.sql
|
||||
*/
|
||||
public function search(string $ip, string $raw_query, array $filters, ?string $fallback_board): array {
|
||||
$board = $filters[self::SEARCH_FILTER_BOARD] ?? $fallback_board;
|
||||
public function search(string $ip, string $raw_query, SearchFiltersWeighted $filters, ?string $fallback_board): array {
|
||||
$board = $filters->board ?? $fallback_board;
|
||||
if ($board === null) {
|
||||
return [];
|
||||
}
|
||||
|
@ -317,26 +326,24 @@ class SearchService {
|
|||
return [];
|
||||
}
|
||||
|
||||
list($filters, $weight) = $this->filterFilters($filters);
|
||||
|
||||
$weight_perc = ($weight / $this->max_weight) * 100;
|
||||
$weight_perc = ($filters->weight / $this->max_weight) * 100;
|
||||
if ($weight_perc > 85) {
|
||||
// Over 85% of the maximum weight.
|
||||
$this->log->log(LogDriver::NOTICE, "$ip search: weight $weight_perc ($weight) query '$raw_query'");
|
||||
$this->log->log(LogDriver::NOTICE, "$ip search: weight $weight_perc ({$filters->weight}) query '$raw_query'");
|
||||
} else {
|
||||
$this->log->log(LogDriver::INFO, "$ip search: weight $weight_perc ($weight) query '$raw_query'");
|
||||
$this->log->log(LogDriver::INFO, "$ip search: weight $weight_perc ({$filters->weight}) query '$raw_query'");
|
||||
}
|
||||
|
||||
$flags = $filters[self::SEARCH_FILTER_FLAG] !== null ? $this->matchFlag($filters[self::SEARCH_FILTER_FLAG]) : null;
|
||||
$flags = $filters->flag !== null ? $this->matchFlag($filters->flag) : null;
|
||||
|
||||
return $this->user_queries->searchPosts(
|
||||
$board,
|
||||
$filters[self::SEARCH_FILTER_SUBJECT],
|
||||
$filters[self::SEARCH_FILTER_NAME],
|
||||
$filters->subject,
|
||||
$filters->name,
|
||||
$flags,
|
||||
$filters[self::SEARCH_FILTER_ID],
|
||||
$filters[self::SEARCH_FILTER_THREAD],
|
||||
$filters[self::SEARCH_FILTER_BODY],
|
||||
$filters->id,
|
||||
$filters->thread,
|
||||
$filters->body,
|
||||
$this->post_limit
|
||||
);
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue