diff --git a/inc/Service/SearchService.php b/inc/Service/SearchService.php new file mode 100644 index 00000000..e9aab2bd --- /dev/null +++ b/inc/Service/SearchService.php @@ -0,0 +1,363 @@ + '\\', + '\\*' => '*', + '\\"' => '"' + ]); + } +

    /**
     * Split the filter into fragments along the wildcards, handling escaping.
     *
     * @param string $str The full filter.
     * @return array The pieces of $str found between the separators matched by the pattern.
     */
    private static function split(string $str): array {
        // NOTE(review): the first alternative matches an escaped asterisk (a backslash followed by `*`,
        // optionally preceded by pairs of backslashes), so escaped asterisks are consumed as split points
        // just like bare ones. Confirm this is intended.
        return \preg_split('/(?:\\\\\\\\)*\\\\\*|(?:\\\\\\\\)*\*+/', $str);
    }

    /**
     * Computes a weight from the content of the fragments.
     *
     * A fragment listed in self::COMMON_WORDS adds 16 when it is shorter than 4 bytes and 6 otherwise.
     * Any other fragment shorter than 4 bytes adds 6. Everything else adds nothing.
     *
     * @param array $fragments The fragments to weight.
     * @return float The summed weight.
     */
    private static function weightByContent(array $fragments): float {
        $weight = 0.0;

        foreach ($fragments as $fragment) {
            $is_short = \strlen($fragment) < 4;

            // Strict comparison: numeric-looking strings must not match each other loosely.
            if (\in_array($fragment, self::COMMON_WORDS, true)) {
                $weight += $is_short ? 16 : 6;
            } elseif ($is_short) {
                $weight += 6;
            }
        }

        return $weight;
    }

    /**
     * Splits a filter into its non-empty fragments and computes the weight of its wildcards.
     *
     * @param string $filter The raw filter value, possibly containing wildcards.
     * @return array A triple: the unescaped and trimmed non-empty fragments, their total length in bytes,
     *               and the wildcard weight.
     */
    private static function filterAndWeight(string $filter): array {
        $fragments = self::split($filter);
        $acc = [];
        $total_len = 0;

        foreach ($fragments as $fragment) {
            $fragment = self::trim(self::unescape($fragment));

            // Note: empty() also discards the fragment "0".
            if (!empty($fragment)) {
                $total_len += \strlen($fragment);
                $acc[] = $fragment;
            }
        }

        // Interword wildcards: the number of fragments minus 1, never negative.
        $interword = \max(\count($fragments) - 1, 0);
        // Wildcards over the total length of the word. A filter without any searchable text has no length
        // to divide by, so its percentage is 0.
        $perc = $total_len > 0 ? $interword / $total_len * 100 : 0.0;
        $wildcard_weight = $perc + \count($fragments) * 2;

        return [ $acc, $total_len, $wildcard_weight ];
    }

    /**
     * Gets a subset of the given strings which match every filter.
     *
     * A string matches when all the fragments occur in it in the given order, without overlapping. The
     * comparison is case-insensitive. The keys of $strings are preserved.
     *
     * @param array $strings An array of strings.
     * @param array $fragments User provided fragments to search in the flags.
     * @return array An array of strings, subset of $strings.
     */
    private static function matchStrings(array $strings, array $fragments): array {
        return \array_filter($strings, static function ($str) use ($fragments) {
            // Where the next fragment may start. We use this to ensure the fragments are one after the other.
            $offset = 0;

            foreach ($fragments as $fragment) {
                if ($fragment === '') {
                    // An empty fragment constrains nothing.
                    continue;
                }

                $pos = \stripos($str, $fragment, $offset);
                if ($pos === false) {
                    // Exclude flags that don't match even a single fragment.
                    return false;
                }

                // Always <= strlen($str), so it stays a valid offset for the next stripos() call.
                $offset = $pos + \strlen($fragment);
            }

            return true;
        });
    }

    /**
     * Parses a raw search query.
     *
     * @param string $raw_query Raw user query. Phrases are searched in the post bodies. The user can specify also
     *                          additional filters in the key:value format.
     *                          Available filters:
     *                          - board: the board, value can be quoted
     *                          - subject: post subject, value can be quoted, supports wildcards
     *                          - name: post name, value can be quoted, supports wildcards
     *                          - flag: post flag, value can be quoted, supports wildcards
     *                          - id: post id, must be numeric
     *                          - thread: thread id, must be numeric
     *                          The remaining text is split into chunks and searched in the post body.
     * @return FiltersParseResult
     * @throws \RuntimeException If the query cannot be truncated or decoded.
     */
    public function parse(string $raw_query): FiltersParseResult {
        $tres = self::truncateQuery($raw_query, $this->max_query_length);
        if ($tres === null) {
            throw new \RuntimeException('Could not truncate query');
        }

        $pres = \preg_match_all(
            '/(?:
                \b(board):
                (?:
                    "([^"]+)"                                   # [2] board: "quoted"
                    |
                    ([^\s"]+)                                   # [3] board: unquoted
                )
                |
                \b(subject|name|flag):
                (?:
                    "((?:\\\\\\\\|\\\\\"|\\\\\*|[^"\\\\])*)"    # [5] quoted with wildcards
                    |
                    ((?:\\\\\\\\|\\\\\*|[^\s\\\\])++)           # [6] unquoted with wildcards
                )
                |
                \b(id|thread):
                (\d+)                                           # [8] numeric only
                |
                "((?:\\\\\\\\|\\\\\"|\\\\\*|[^"\\\\])*)"        # [9] quoted free text
                |
                ([^"\s]++)                                      # [10] unquoted free text block
            )/iux',
            $tres,
            $matches,
            \PREG_SET_ORDER
        );
        if ($pres === false) {
            throw new \RuntimeException('Could not decode the query');
        }

        $filters = new FiltersParseResult();

        // With PREG_SET_ORDER the trailing groups that did not take part in a match are missing from the
        // match array, so every optional group is read with a fallback.
        foreach ($matches as $m) {
            if (!empty($m[1])) {
                // board (no wildcards).
                $raw = ($m[2] ?? '') !== '' ? $m[2] : ($m[3] ?? '');

                $filters->board = \trim($raw, '/');
            } elseif (!empty($m[4])) {
                // subject, name, flag (with wildcards).
                $key = \strtolower($m[4]);
                // Null when the value was an empty pair of quotes.
                $value = ($m[5] ?? '') !== '' ? $m[5] : ($m[6] ?? null);

                if ($key === 'name') {
                    $filters->name = $value;
                } elseif ($key === 'subject') {
                    $filters->subject = $value;
                } else {
                    $filters->flag = $value;
                }
            } elseif (!empty($m[7])) {
                // id, thread (numeric only).
                $key = \strtolower($m[7]);
                $value = (int)$m[8];

                if ($key === 'id') {
                    $filters->id = $value;
                } else {
                    $filters->thread = $value;
                }
            } elseif (!empty($m[9]) || !empty($m[10])) {
                // Free text, searched in the post body.
                $filters->body[] = !empty($m[9]) ? $m[9] : $m[10];
            }
        }

        return $filters;
    }

    /**
     * @param LogDriver $log Logger used to record the searches.
     * @param UserPostQueries $user_queries User posts queries.
     * @param ?array $flag_map The key-value map of user flags, or null to disable flag search.
     * @param float $max_weight Maximum total weight of the filters of a query.
     * @param int $max_query_length Length the raw queries are truncated to before parsing.
     * @param int $post_limit Limit passed to the posts search.
     */
    public function __construct(LogDriver $log, UserPostQueries $user_queries, ?array $flag_map, float $max_weight, int $max_query_length, int $post_limit) {
        $this->log = $log;
        $this->user_queries = $user_queries;
        $this->flag_map = $flag_map;
        $this->max_weight = $max_weight;
        $this->max_query_length = $max_query_length;
        $this->post_limit = $post_limit;
    }

    /**
     * Reduces the user provided filters and assigns them a total weight.
     *
     * Subject, name and flag filters are dropped when their text is longer than the respective limit. Body
     * keywords are added one by one, skipping those that would push the total weight over the maximum.
     *
     * @param FiltersParseResult $filters The filters to sanitize, reduce and weight.
     * @return SearchFilters
     */
    public function reduceAndWeight(FiltersParseResult $filters): SearchFilters {
        $weighted = new SearchFilters();

        if ($filters->subject !== null) {
            [ $fragments, $total_len, $wildcard_weight ] = self::filterAndWeight($filters->subject);

            if ($total_len <= self::MAX_LENGTH_SUBJECT) {
                $weighted->subject = $fragments;
                $weighted->weight += $wildcard_weight;
            }
        }

        if ($filters->name !== null) {
            [ $fragments, $total_len, $wildcard_weight ] = self::filterAndWeight($filters->name);

            if ($total_len <= self::MAX_LENGTH_NAME) {
                $weighted->name = $fragments;
                $weighted->weight += $wildcard_weight;
            }
        }

        // No wildcard support, and obligatory anyway so it weights 0.
        $weighted->board = $filters->board;

        if ($filters->flag !== null) {
            // Stays an empty list if the flag search is disabled or the filter is too long.
            $weighted->flag = [];

            if (!empty($this->flag_map)) {
                $max_flag_length = \array_reduce($this->flag_map, fn($max, $str) => \max($max, \strlen($str)), 0);

                [ $fragments, $total_len, $wildcard_weight ] = self::filterAndWeight($filters->flag);

                // Add 2 to account for possible wildcards on the ends.
                if ($total_len <= $max_flag_length + 2) {
                    $weighted->flag = $fragments;
                    $weighted->weight += $wildcard_weight;
                }
            }
        }

        $weighted->id = $filters->id;
        $weighted->thread = $filters->thread;

        if (!empty($filters->body)) {
            foreach ($filters->body as $keyword) {
                [ $fragments, , $wildcard_weight ] = self::filterAndWeight($keyword);
                $str_weight = self::weightByContent($fragments) + $wildcard_weight;

                // Keywords that would exceed the weight budget are left out of the search.
                if ($str_weight + $weighted->weight <= $this->max_weight) {
                    $weighted->weight += $str_weight;
                    $weighted->body[] = $fragments;
                }
            }
        }

        return $weighted;
    }

    /**
     * Run a search on user posts with the given filters.
     *
     * @param string $ip Written to the log together with the query.
     * @param string $raw_query The query as typed by the user, only used for logging.
     * @param SearchFilters $filters The filters made by {@see self::reduceAndWeight()}.
     * @param ?string $fallback_board Fallback board if there isn't a board filter.
     * @return array Data array straight from the PDO, with all the fields in posts.sql
     */
    public function search(string $ip, string $raw_query, SearchFilters $filters, ?string $fallback_board): array {
        $board = $filters->board ?? $fallback_board;
        if ($board === null) {
            return [];
        }

        // Strict comparison, otherwise numeric-looking input like " 10" would be accepted for the board "10".
        $valid_uris = listBoards(true);
        if (!\in_array($board, $valid_uris, true)) {
            return [];
        }

        // A maximum weight of 0 leaves nothing to compute a percentage against.
        $weight_perc = $this->max_weight > 0 ? ($filters->weight / $this->max_weight) * 100 : 0.0;
        // Over 85 of the weight the search is logged with more prominence.
        $level = $weight_perc > 85 ? LogDriver::NOTICE : LogDriver::INFO;
        $this->log->log($level, "$ip search: weight $weight_perc ({$filters->weight}) query '$raw_query'");

        $flags = [];
        if ($filters->flag !== null && $this->flag_map !== null) {
            $flags = self::matchStrings($this->flag_map, $filters->flag);
            if (empty($flags)) {
                // The query doesn't match any flags so it will always fail anyway.
                return [];
            }
        }

        return $this->user_queries->searchPosts(
            $board,
            $filters->subject,
            $filters->name,
            $flags,
            $filters->id,
            $filters->thread,
            $filters->body,
            $this->post_limit
        );
    }
}