Merge pull request 'Refactor the search subsystem' (#127) from rework-search into config

Reviewed-on: leftypol/leftypol#127
This commit is contained in:
Zankaria 2025-07-08 13:59:52 -05:00
commit 6e9d0a4e77
13 changed files with 1093 additions and 173 deletions

View file

@ -1,5 +1,7 @@
[www]
access.log = /proc/self/fd/2
php_admin_value[error_log] = /proc/self/fd/2
php_admin_flag[log_errors] = on
; Ensure worker stdout and stderr are sent to the main error log.
catch_workers_output = yes

View file

@ -0,0 +1,13 @@
<?php
namespace Vichan\Data;
/**
 * Raw filter values extracted from a user search query by SearchService::parse(),
 * before they are split into fragments, sanitized and weighted.
 */
class FiltersParseResult {
// Free-text phrases (quoted strings or bare words) to look for in the post bodies.
public array $body = [];
// Raw subject filter, wildcards and escape sequences still in place. Null when absent.
public ?string $subject = null;
// Raw name filter, wildcards and escape sequences still in place. Null when absent.
public ?string $name = null;
// Board filter with the surrounding slashes trimmed. Null when absent.
public ?string $board = null;
// Raw flag filter, wildcards and escape sequences still in place. Null when absent.
public ?string $flag = null;
// Numeric post id filter. Null when absent.
public ?int $id = null;
// Numeric thread id filter. Null when absent.
public ?int $thread = null;
}

285
inc/Data/Flags.php Normal file
View file

@ -0,0 +1,285 @@
<?php
namespace Vichan\Data;
/**
 * Static data about the flags shipped with the software.
 */
class Flags {
/**
 * Short names of the flags embedded with vichan.
 *
 * The context builder hands this list to the search service as the set of searchable flags when country flags are
 * enabled and custom user flags are not.
 */
public const EMBEDDED_FLAGS = [
'a1',
'a2',
'ac',
'ad',
'ae',
'af',
'ag',
'ai',
'al',
'am',
'an',
'ao',
'ap',
'aq',
'ar',
'as',
'at',
'au',
'aw',
'ax',
'az',
'ba',
'bb',
'bd',
'be',
'bf',
'bg',
'bh',
'bi',
'bj',
'bl',
'bm',
'bn',
'bo',
'bq',
'br',
'bs',
'bt',
'bu',
'bv',
'bw',
'by',
'bz',
'ca',
'cat',
'cc',
'cd',
'cf',
'cg',
'ch',
'ci',
'ck',
'cl',
'cm',
'cn',
'co',
'cp',
'cr',
'cs',
'cu',
'cv',
'cw',
'cx',
'cy',
'cz',
'de',
'dg',
'dj',
'dk',
'dm',
'do',
'dz',
'ea',
'ec',
'ee',
'eg',
'eh',
'er',
'es',
'et',
'eu',
'fi',
'fj',
'fk',
'fm',
'fo',
'fr',
'fx',
'ga',
'gb',
'gd',
'ge',
'gf',
'gg',
'gh',
'gi',
'gl',
'gm',
'gn',
'gp',
'gq',
'gr',
'gs',
'gt',
'gu',
'gw',
'gy',
'hk',
'hm',
'hn',
'hr',
'ht',
'hu',
'ic',
'id',
'ie',
'il',
'im',
'in',
'io',
'iq',
'ir',
'is',
'it',
'je',
'jm',
'jo',
'jp',
'ke',
'kg',
'kh',
'ki',
'km',
'kn',
'kp',
'kr',
'kw',
'ky',
'kz',
'la',
'lb',
'lc',
'li',
'lk',
'lr',
'ls',
'lt',
'lu',
'lv',
'ly',
'ma',
'mc',
'md',
'me',
'mf',
'mg',
'mh',
'mk',
'ml',
'mm',
'mn',
'mo',
'mp',
'mq',
'mr',
'ms',
'mt',
'mu',
'mv',
'mw',
'mx',
'my',
'mz',
'na',
'nc',
'ne',
'nf',
'ng',
'ni',
'nl',
'no',
'np',
'nr',
'nt',
'nu',
'nz',
'o1',
'om',
'pa',
'pe',
'pf',
'pg',
'ph',
'pk',
'pl',
'pm',
'pn',
'pr',
'ps',
'pt',
'pw',
'py',
'qa',
're',
'ro',
'rs',
'ru',
'rw',
'sa',
'sb',
'sc',
'sd',
'se',
'sf',
'sg',
'sh',
'si',
'sj',
'sk',
'sl',
'sm',
'sn',
'so',
'sr',
'ss',
'st',
'su',
'sv',
'sx',
'sy',
'sz',
'ta',
'tc',
'td',
'tf',
'tg',
'th',
'ti',
'tj',
'tk',
'tl',
'tm',
'tn',
'to',
'tp',
'tr',
'tt',
'tv',
'tw',
'tz',
'ua',
'ug',
'uk',
'um',
'us',
'uy',
'uz',
'va',
'vc',
've',
'vg',
'vi',
'vn',
'vu',
'wf',
'ws',
'xx',
'ye',
'yt',
'yu',
'za',
'zm',
'zr',
'zw',
];
}

View file

@ -0,0 +1,32 @@
<?php
namespace Vichan\Data;
/**
 * POD with the fragments of each filter.
 *
 * Produced by SearchService::reduceAndWeight() from the raw parse result and consumed by SearchService::search().
 * Text filters are stored as lists of fragments: the pieces of the original filter found between its wildcards.
 */
class SearchFilters {
/**
 * Body filters. Each element holds the fragments of one phrase; every phrase is searched independently.
 *
 * @var array<array<string>>
 */
public array $body = [];
/**
 * Fragments of the subject filter. Empty when there is no subject filter.
 *
 * @var array<string>
 */
public array $subject = [];
/**
 * Fragments of the name filter. Empty when there is no name filter.
 *
 * @var array<string>
 */
public array $name = [];
/**
 * Board to search in, or null if the query did not name one.
 *
 * @var ?string
 */
public ?string $board = null;
/**
 * Fragments of the flag filter. Empty when there is no flag filter.
 *
 * @var array<string>
 */
public array $flag = [];
// Post id filter, or null when absent.
public ?int $id = null;
// Thread id filter, or null when absent.
public ?int $thread = null;
// Accumulated weight of the accepted filters, as computed by SearchService::reduceAndWeight().
public float $weight = 0;
}

View file

@ -0,0 +1,98 @@
<?php
namespace Vichan\Data;
/**
 * Implements flood control for search queries.
 *
 * Every accepted search is recorded in the `search_queries` table. A new search is refused when either the
 * requesting IP or the site as a whole already issued more than the allowed number of searches inside the
 * respective time window.
 */
class SearchQueries {
    private \PDO $pdo;
    private int $queries_for_single;
    private int $range_for_single;
    private int $queries_for_all;
    private int $range_for_all;
    private bool $auto_gc;

    /**
     * Counts the recorded queries newer than the given timestamp.
     *
     * @param int $since Unix timestamp; only queries strictly newer than this are counted.
     * @param ?string $ip Restrict the count to this IP, or null to count the queries of every IP.
     * @return int
     */
    private function countSince(int $since, ?string $ip): int {
        if ($ip === null) {
            $query = $this->pdo->prepare("SELECT COUNT(*) FROM `search_queries` WHERE `time` > :time");
        } else {
            $query = $this->pdo->prepare("SELECT COUNT(*) FROM `search_queries` WHERE `ip` = :ip AND `time` > :time");
            $query->bindValue(':ip', $ip);
        }
        $query->bindValue(':time', $since, \PDO::PARAM_INT);
        $query->execute();
        // fetchColumn() hands back a string with most drivers: compare integers, not strings.
        return (int)$query->fetchColumn();
    }

    /**
     * Body of {@see self::checkFlood()}, meant to run inside a transaction.
     */
    private function checkFloodImpl(string $ip, string $phrase): bool {
        $now = \time();

        if ($this->countSince($now - $this->range_for_single, $ip) > $this->queries_for_single) {
            return true;
        }
        if ($this->countSince($now - $this->range_for_all, null) > $this->queries_for_all) {
            return true;
        }

        // Under the limits: record this query so that it counts against the next ones.
        $query = $this->pdo->prepare("INSERT INTO `search_queries` VALUES (:ip, :time, :query)");
        $query->bindValue(':ip', $ip);
        $query->bindValue(':time', $now, \PDO::PARAM_INT);
        $query->bindValue(':query', $phrase);
        $query->execute();

        if ($this->auto_gc) {
            $this->purgeExpired();
        }
        return false;
    }

    /**
     * @param \PDO $pdo PDO to access the DB.
     * @param int $queries_for_single Maximum number of queries for a single IP.
     * @param int $range_for_single Maximum age of the oldest query to consider from a single IP, in seconds.
     * @param int $queries_for_all Maximum number of queries for all IPs.
     * @param int $range_for_all Maximum age of the oldest query to consider from all IPs, in seconds.
     * @param bool $auto_gc If to run the cleanup at every check. Must be invoked from the outside otherwise.
     */
    public function __construct(
        \PDO $pdo,
        int $queries_for_single,
        int $range_for_single,
        int $queries_for_all,
        int $range_for_all,
        bool $auto_gc
    ) {
        $this->pdo = $pdo;
        $this->queries_for_single = $queries_for_single;
        $this->range_for_single = $range_for_single;
        $this->queries_for_all = $queries_for_all;
        $this->range_for_all = $range_for_all;
        $this->auto_gc = $auto_gc;
    }

    /**
     * Check if the IP-query pair overflows the limit.
     * If it does not, the query is recorded.
     *
     * @param string $ip Source IP.
     * @param string $phrase The search query.
     * @return bool True if the request goes over the limit.
     */
    public function checkFlood(string $ip, string $phrase): bool {
        $this->pdo->beginTransaction();
        try {
            $ret = $this->checkFloodImpl($ip, $phrase);
            $this->pdo->commit();
            return $ret;
        } catch (\Throwable $e) {
            // Catch \Throwable rather than \Exception, so that an \Error does not leave the transaction open.
            if ($this->pdo->inTransaction()) {
                $this->pdo->rollBack();
            }
            throw $e;
        }
    }

    /**
     * Deletes the recorded queries that are too old to matter for any of the flood checks.
     *
     * @return int The number of deleted rows.
     */
    public function purgeExpired(): int {
        // Keep everything that either check may still need: purging by the global window alone would erase the
        // per-IP history whenever the per-IP window is the longer one.
        $longest_range = \max($this->range_for_single, $this->range_for_all);

        $query = $this->pdo->prepare("DELETE FROM `search_queries` WHERE `time` <= :expiry_limit");
        $query->bindValue(':expiry_limit', \time() - $longest_range, \PDO::PARAM_INT);
        $query->execute();
        return $query->rowCount();
    }
}

View file

@ -13,6 +13,36 @@ class UserPostQueries {
private \PDO $pdo;
/**
 * Makes a string safe to embed in a LIKE pattern, so that it is matched literally.
 * The backslash, the percent sign and the underscore are each prefixed with a backslash.
 */
private static function escapeLike(string $str): string {
    // strtr() with a map replaces in a single pass: the backslashes added in front of the wildcards are never
    // escaped a second time.
    return \strtr($str, [
        '\\' => '\\\\',
        '%' => '\\%',
        '_' => '\\_',
    ]);
}
/**
 * Builds the argument list of a CONCAT() call which joins the fragments of a filter with wildcards.
 * The list alternates the SQL literal '%' with one named placeholder per fragment, starting and ending with '%'.
 * Given prefix = cat and fragments_count = 3, we get [ "'%'", ":cat0", "'%'", ":cat1", "'%'", ":cat2", "'%'" ].
 *
 * @param string $prefix The prefix for the parameter binding
 * @param int $fragments_count MUST BE >= 1.
 * @return array
 */
private static function arrayOfFragments(string $prefix, int $fragments_count): array {
    $pieces = [ "'%'" ];
    $index = 0;
    while ($index < $fragments_count) {
        // One placeholder, followed by the wildcard that separates it from the next one (or closes the pattern).
        \array_push($pieces, ":{$prefix}{$index}", "'%'");
        $index++;
    }
    return $pieces;
}
/**
 * @param \PDO $pdo PDO to access the DB.
 */
public function __construct(\PDO $pdo) {
$this->pdo = $pdo;
}
@ -156,4 +186,89 @@ class UserPostQueries {
}
});
}
/**
 * Search among the user posts with the given filters.
 * The subject, name and elements of the bodies filters are fragments which are joined together with wildcards, to
 * allow for more flexible filtering. All the filters must match (AND), except for the flags, where matching any
 * one of them is enough (OR).
 *
 * @param string $board The board where to search in.
 * @param array<string> $subject Fragments of the subject filter.
 * @param array<string> $name Fragments of the name filter.
 * @param array<string> $flags An array of the flag names to search among the HTML. Keys are ignored.
 * @param ?int $id Post id filter.
 * @param ?int $thread Thread id filter.
 * @param array<array<string>> $bodies An array whose element are arrays containing the fragments of multiple body filters, each
 *                                     searched independently from the others. Filters without fragments are ignored.
 * @param integer $limit The maximum number of results.
 * @throws \InvalidArgumentException If the board name cannot be used in a table name.
 * @throws PDOException On error.
 * @return array<array> The matching rows, or an empty array if no filter is given.
 */
public function searchPosts(string $board, array $subject, array $name, array $flags, ?int $id, ?int $thread, array $bodies, int $limit): array {
    // The board ends up inside a backtick-quoted identifier, which cannot be bound as a parameter: refuse the only
    // character able to break out of the quoting.
    if (\strpos($board, '`') !== false) {
        throw new \InvalidArgumentException('Invalid board name');
    }

    // Placeholders are numbered by position, while callers may pass filtered arrays with gaps or string keys.
    $subject = \array_values($subject);
    $name = \array_values($name);
    $flags = \array_values($flags);

    // A body filter without fragments would become LIKE CONCAT('%') and match every post: drop it.
    $usable_bodies = [];
    foreach ($bodies as $body) {
        if (!empty($body)) {
            $usable_bodies[] = \array_values($body);
        }
    }
    $bodies = $usable_bodies;

    $where_acc = [];

    if (!empty($subject)) {
        $like_arg = self::arrayOfFragments('subj', \count($subject));
        $where_acc[] = 'subject LIKE CONCAT(' . \implode(', ', $like_arg) . ')';
    }
    if (!empty($name)) {
        $like_arg = self::arrayOfFragments('name', \count($name));
        $where_acc[] = 'name LIKE CONCAT(' . \implode(', ', $like_arg) . ')';
    }
    if (!empty($flags)) {
        // vichan stores the flag inside the generated markup of the post, so the only way to filter by flag is a
        // LIKE over the whole body. This is slow.
        // NOTE(review): the pattern expects a bare <tinyboard> tag around the flag - confirm it matches how the
        // flags are actually stored.
        $flag_acc = [];
        for ($i = 0; $i < \count($flags); $i++) {
            // Each alternative must be a full comparison: "col LIKE (a OR b)" would compare against a boolean.
            $flag_acc[] = "body_nomarkup LIKE CONCAT('%<tinyboard>', :flag$i, '</tinyboard>%')";
        }
        $where_acc[] = '(' . \implode(' OR ', $flag_acc) . ')';
    }
    if ($id !== null) {
        $where_acc[] = 'id = :id';
    }
    if ($thread !== null) {
        $where_acc[] = 'thread = :thread';
    }
    for ($i = 0; $i < \count($bodies); $i++) {
        $like_arg = self::arrayOfFragments("body_{$i}_", \count($bodies[$i]));
        $where_acc[] = 'body_nomarkup LIKE CONCAT(' . \implode(', ', $like_arg) . ')';
    }

    if (empty($where_acc)) {
        // Nothing to filter by: never dump the whole table.
        return [];
    }

    // NOTE(review): there is no ORDER BY, so which rows survive the LIMIT is up to the database - confirm this
    // is intended.
    $sql = "SELECT * FROM `posts_$board` WHERE " . \implode(' AND ', $where_acc) . ' LIMIT :limit';
    $query = $this->pdo->prepare($sql);

    for ($i = 0; $i < \count($subject); $i++) {
        $query->bindValue(":subj$i", self::escapeLike($subject[$i]));
    }
    for ($i = 0; $i < \count($name); $i++) {
        $query->bindValue(":name$i", self::escapeLike($name[$i]));
    }
    for ($i = 0; $i < \count($flags); $i++) {
        $query->bindValue(":flag$i", self::escapeLike($flags[$i]));
    }
    if ($id !== null) {
        $query->bindValue(':id', $id, \PDO::PARAM_INT);
    }
    if ($thread !== null) {
        $query->bindValue(':thread', $thread, \PDO::PARAM_INT);
    }
    for ($body_i = 0; $body_i < \count($bodies); $body_i++) {
        $body = $bodies[$body_i];
        for ($i = 0; $i < \count($body); $i++) {
            $query->bindValue(":body_{$body_i}_{$i}", self::escapeLike($body[$i]));
        }
    }
    $query->bindValue(':limit', $limit, \PDO::PARAM_INT);

    $query->execute();
    return $query->fetchAll(\PDO::FETCH_ASSOC);
}
}

View file

@ -0,0 +1,417 @@
<?php
namespace Vichan\Service;
use Vichan\Data\Driver\LogDriver;
use Vichan\Data\{FiltersParseResult, UserPostQueries, SearchFilters, SearchQueries};
/**
 * Parses, weights and runs the searches of the users over the posts of a board.
 *
 * The expected flow is: {@see self::checkFlood()}, {@see self::parse()}, {@see self::reduceAndWeight()} and
 * finally {@see self::search()}.
 */
class SearchService {
    /**
     * Words considered too common to be selective: body fragments equal to one of these weight more.
     */
    private const COMMON_WORDS = [
        'anon', 'thread', 'board', 'post', 'reply', 'image', 'topic', 'bump', 'sage', 'tripcode', 'groyper',
        'mod', 'admin', 'ban', 'rules', 'sticky', 'archive', 'catalog', 'report', 'captcha', 'proxy', 'the',
        'vpn', 'tor', 'doxx', 'spam', 'troll', 'bait', 'flame', 'greentext', 'copypasta', 'meme', 'this',
        'shitpost', 'shitposting', 'edgy', 'kek', 'lulz', 'rekt', 'smug', 'lewd', 'nsfw', 'anonymous', 'glowie',
        'cringe', 'normie', 'boomer', 'zoomer', 'incel', 'chad', 'stacy', 'simp', 'based', 'redpill', 'color',
        'blackpill', 'whitepill', 'bluepill', 'clownworld', 'coomer', 'doomer', 'wojak', 'soyjak', 'pepe',
        'style', 'weight', 'size', 'freedom', 'speech', 'censorship', 'moderation', 'community', 'anonymous',
        'reply', 'search', 'group', 'merge', 'flatten', 'lock', 'unlock', 'hide', 'uyghur', 'soyshit', 'glow',
        'also', 'only', 'just', 'even', 'very', 'than', 'then', 'that', 'this', 'with',
        'from', 'into', 'onto', 'over', 'under', 'about', 'after', 'before', 'since', 'while',
        'because', 'although', 'though', 'unless', 'until', 'where', 'which', 'whose', 'there', 'their',
        'these', 'those', 'being', 'having', 'doing', 'going', 'would', 'could', 'should', 'shall', 'everything',
        'might', 'must', 'will', 'have', 'been', 'were', 'wasn', 'aren', 'isn', 'does', 'isnt', 'mustnt',
        'didn', 'hadn', 'hasn', 'dont', 'cant', 'wont', 'cannot', 'haven', 'weren', 'didnt', 'since',
        'mustn', 'mightn', 'shouldn', 'wouldn', 'mightve', 'wouldve', 'shouldve', 'couldve', 'mustve',
        'wasnt', 'werent', 'hasnt', 'hadnt', 'wont', 'wouldnt', 'shouldnt', 'couldnt', 'mightnt',
        'each', 'such', 'some', 'most', 'many', 'more', 'much', 'less', 'few', 'none', 'although', 'because',
        'both', 'either', 'neither', 'every', 'anyone', 'someone', 'everyone', 'nobody', 'nothing', 'so',
        'above', 'below', 'along', 'across', 'among', 'until', 'and', 'but', 'or', 'nor', 'for', 'yet',
    ];
    private const MAX_LENGTH_SUBJECT = 100; // posts.sql
    private const MAX_LENGTH_NAME = 35; // posts.sql

    private LogDriver $log;
    private UserPostQueries $user_queries;
    private SearchQueries $search_queries;
    private ?array $flag_map;
    private float $max_weight;
    private int $max_query_length;
    private int $post_limit;
    private array $searchable_board_uris;

    /**
     * Shortens the text to at most $byteLimit bytes, cutting at the last space or word boundary.
     *
     * @param string $text The text to shorten.
     * @param int $byteLimit Maximum length in bytes.
     * @return ?string The text, possibly shortened, or null if no suitable point to cut at was found.
     */
    private static function truncateQuery(string $text, int $byteLimit): ?string {
        if (\strlen($text) <= $byteLimit) {
            return $text;
        }

        // Cut at byte length, then sanitize the encoding to get rid of an incomplete multibyte character at the end.
        $cut = \mb_convert_encoding(\substr($text, 0, $byteLimit), 'UTF-8', 'UTF-8');

        // Try the last space.
        $spacePos = \strrpos($cut, ' ');
        if ($spacePos !== false) {
            return \substr($cut, 0, $spacePos);
        }

        // Fallback to the last word boundary.
        if (\preg_match('/^(.+)\b/u', $cut, $m)) {
            return $m[1];
        }

        // Too long but could not cut.
        return null;
    }

    /**
     * Strips asterisks, whitespace and NUL bytes from both ends of the string.
     */
    private static function trim(string $str): string {
        return \trim($str, "* \n\r\t\v\0");
    }

    /**
     * Resolves the escape sequences of the query syntax: \\, \* and \".
     */
    private static function unescape(string $str): string {
        return \strtr($str, [
            '\\\\' => '\\',
            '\\*' => '*',
            '\\"' => '"'
        ]);
    }

    /**
     * Split the filter into fragments along the wildcards, handling escaping.
     *
     * NOTE(review): the pattern also splits on escaped asterisks (\*) and consumes the escaped backslashes in front
     * of a wildcard, so neither reaches the fragments - confirm whether a literal asterisk is meant to be searchable.
     *
     * @param string $str The full filter.
     * @throws \RuntimeException If the filter cannot be processed by the regex engine.
     * @return array<string>
     */
    private static function split(string $str): array {
        // Split the fragments
        $fragments = \preg_split('/(?:\\\\\\\\)*\\\\\*|(?:\\\\\\\\)*\*+/', $str);
        if ($fragments === false) {
            // Would otherwise surface as a TypeError on the return type.
            throw new \RuntimeException('Could not split the filter');
        }
        return $fragments;
    }

    /**
     * Weights the fragments by how unselective they are: short and common words are the most expensive.
     *
     * @param array<string> $fragments The cleaned up fragments of a filter.
     * @return float
     */
    private static function weightByContent(array $fragments): float {
        $w = 0.0;
        foreach ($fragments as $fragment) {
            $short = \strlen($fragment) < 4;
            // The search is case-insensitive, so is the check against the common words.
            if (\in_array(\strtolower($fragment), self::COMMON_WORDS, true)) {
                $w += $short ? 16 : 6;
            } elseif ($short) {
                $w += 6;
            }
        }
        return $w;
    }

    /**
     * Splits a filter into its cleaned up fragments and weights its use of wildcards.
     *
     * @param string $filter The raw filter, as typed by the user.
     * @return array A triple: the non-empty fragments, their total length in characters, the weight of the wildcards.
     */
    private static function filterAndWeight(string $filter): array {
        $fragments = self::split($filter);
        $acc = [];
        $total_len = 0;

        foreach ($fragments as $fragment) {
            $fragment = self::trim(self::unescape($fragment));
            // Not empty(): it would also throw away the legitimate fragment "0".
            if ($fragment !== '') {
                // Characters, not bytes: the limits this is checked against are column lengths.
                $total_len += \mb_strlen($fragment, 'UTF-8');
                $acc[] = $fragment;
            }
        }

        // Interword wildcards: the ones between two actual fragments. Ergo the number of fragments minus 1.
        $interword = \max(\count($acc) - 1, 0);
        // Wildcards over the total length of the words. A filter made only of wildcards has no length to divide by.
        $perc = $total_len > 0 ? $interword / $total_len * 100 : 0;
        $wildcard_weight = $perc + \count($fragments) * 2;

        return [ $acc, $total_len, $wildcard_weight ];
    }

    /**
     * Gets a subset of the given strings which match every filter.
     * A string matches if it contains all the fragments, case-insensitively, in order and without overlaps.
     *
     * @param array<string> $strings An array of strings.
     * @param array<string> $fragments User provided fragments to search in the flags.
     * @return array<string> A list of strings, subset of $strings. The original keys are not preserved.
     */
    private static function matchStrings(array $strings, array $fragments): array {
        $matching = \array_filter($strings, function ($str) use ($fragments) {
            // Where the next fragment may start. We use this to ensure the fragments are one after the other.
            $offset = 0;
            foreach ($fragments as $fragment) {
                if ($offset > \strlen($str)) {
                    // Cannot possibly match, and stripos() refuses offsets beyond the end of the string.
                    return false;
                }
                $pos = \stripos($str, $fragment, $offset);
                if ($pos === false) {
                    // Exclude flags that don't match even a single fragment.
                    return false;
                }
                $offset = $pos + \strlen($fragment);
            }
            return true;
        });
        // array_filter() preserves the keys, while the query layer indexes the flags by position.
        return \array_values($matching);
    }

    /**
     * Parses a raw search query.
     *
     * @param string $raw_query Raw user query. Phrases are searched in the post bodies. The user can specify also
     *                          additional filters in the <key>:<value> format.
     *                          Available filters:
     *                           - board: the board, value can be quoted
     *                           - subject: post subject, value can be quoted, supports wildcards
     *                           - name: post name, value can be quoted, supports wildcards
     *                           - flag: post flag, value can be quoted, supports wildcards
     *                           - id: post id, must be numeric
     *                           - thread: thread id, must be numeric
     *                          The remaining text is split into chunks and searched in the post body.
     * @throws \RuntimeException If the query cannot be truncated or decoded.
     * @return FiltersParseResult
     */
    public function parse(string $raw_query): FiltersParseResult{
        $tres = self::truncateQuery($raw_query, $this->max_query_length);
        if ($tres === null) {
            throw new \RuntimeException('Could not truncate query');
        }

        $pres = \preg_match_all(
            '/(?:
                \b(board):
                (?:
                    "([^"]+)" # [2] board: "quoted"
                    |
                    ([^\s"]+) # [3] board: unquoted
                )
                |
                \b(subject|name|flag):
                (?:
                    "((?:\\\\\\\\|\\\\\"|\\\\\*|[^"\\\\])*)" # [5] quoted with wildcards
                    |
                    ((?:\\\\\\\\|\\\\\*|[^\s\\\\])++) # [6] unquoted with wildcards
                )
                |
                \b(id|thread):
                (\d+) # [8] numeric only
                |
                "((?:\\\\\\\\|\\\\\"|\\\\\*|[^"\\\\])*)" # [9] quoted free text
                |
                ([^"\s]++) # [10] unquoted free text block
            )/iux',
            $tres,
            $matches,
            \PREG_SET_ORDER
        );
        if ($pres === false) {
            throw new \RuntimeException('Could not decode the query');
        }

        $filters = new FiltersParseResult();

        foreach ($matches as $m) {
            // With PREG_SET_ORDER the trailing groups which did not take part in the match are missing from $m, and
            // "0" is a legitimate value: read the groups with ?? and compare against the empty string.
            if (($m[1] ?? '') !== '') {
                // board (no wildcards).
                $quoted = $m[2] ?? '';
                $filters->board = \trim($quoted !== '' ? $quoted : ($m[3] ?? ''), '/');
            } elseif (($m[4] ?? '') !== '') {
                // subject, name, flag (with wildcards).
                $key = \strtolower($m[4]);
                $quoted = $m[5] ?? '';
                $value = $quoted !== '' ? $quoted : ($m[6] ?? '');
                // An empty value (subject:"") means no filter.
                $value = $value !== '' ? $value : null;

                if ($key === 'name') {
                    $filters->name = $value;
                } elseif ($key === 'subject') {
                    $filters->subject = $value;
                } else {
                    $filters->flag = $value;
                }
            } elseif (($m[7] ?? '') !== '') {
                // id, thread (numeric).
                $key = \strtolower($m[7]);
                $value = (int)$m[8];

                if ($key === 'id') {
                    $filters->id = $value;
                } else {
                    $filters->thread = $value;
                }
            } else {
                // Free text, quoted or not.
                $quoted = $m[9] ?? '';
                $value = $quoted !== '' ? $quoted : ($m[10] ?? '');
                if ($value !== '') {
                    $filters->body[] = $value;
                }
            }
        }

        return $filters;
    }

    /**
     * @param LogDriver $log Log driver.
     * @param UserPostQueries $user_queries User posts queries.
     * @param SearchQueries $search_queries Search queries for flood detection.
     * @param ?array $flag_map The key-value map of user flags, or null to disable flag search.
     * @param float $max_weight The maximum weight of the parsed user query. Body filters that go beyond this limit are discarded.
     * @param int $max_query_length Maximum length of the raw input query before it's truncated.
     * @param int $post_limit Maximum number of results.
     * @param ?array $searchable_board_uris The uris of the board that can be searched. Null to search all the boards.
     */
    public function __construct(
        LogDriver $log,
        UserPostQueries $user_queries,
        SearchQueries $search_queries,
        ?array $flag_map,
        float $max_weight,
        int $max_query_length,
        int $post_limit,
        ?array $searchable_board_uris
    ) {
        $this->log = $log;
        $this->user_queries = $user_queries;
        $this->search_queries = $search_queries;
        $this->flag_map = $flag_map;
        $this->max_weight = $max_weight;
        $this->max_query_length = $max_query_length;
        $this->post_limit = $post_limit;
        $this->searchable_board_uris = $searchable_board_uris ?? listBoards(true);
    }

    /**
     * Reduces the user provided filters and assigns them a total weight.
     * Subject, name and flag filters that are longer than what could ever match are dropped; body filters are
     * accepted in order, until the maximum weight is reached.
     *
     * @param FiltersParseResult $filters The filters to sanitize, reduce and weight.
     * @return SearchFilters
     */
    public function reduceAndWeight(FiltersParseResult $filters): SearchFilters {
        $weighted = new SearchFilters();

        if ($filters->subject !== null) {
            list($fragments, $total_len, $wildcard_weight) = self::filterAndWeight($filters->subject);

            if ($total_len <= self::MAX_LENGTH_SUBJECT) {
                $weighted->subject = $fragments;
                $weighted->weight += $wildcard_weight;
            }
        }
        if ($filters->name !== null) {
            list($fragments, $total_len, $wildcard_weight) = self::filterAndWeight($filters->name);

            if ($total_len <= self::MAX_LENGTH_NAME) {
                $weighted->name = $fragments;
                $weighted->weight += $wildcard_weight;
            }
        }

        // No wildcard support, and obligatory anyway so it weights 0.
        $weighted->board = $filters->board;

        if ($filters->flag !== null) {
            $weighted->flag = [];

            if (!empty($this->flag_map)) {
                $max_flag_length = \array_reduce(
                    $this->flag_map,
                    fn($max, $str) => \max($max, \mb_strlen($str, 'UTF-8')),
                    0
                );

                list($fragments, $total_len, $wildcard_weight) = self::filterAndWeight($filters->flag);

                // Add 2 to account for possible wildcards on the ends.
                if ($total_len <= $max_flag_length + 2) {
                    $weighted->flag = $fragments;
                    $weighted->weight += $wildcard_weight;
                }
            }
        }

        $weighted->id = $filters->id;
        $weighted->thread = $filters->thread;

        foreach ($filters->body as $keyword) {
            list($fragments, $total_len, $wildcard_weight) = self::filterAndWeight($keyword);
            if (empty($fragments)) {
                // Only wildcards or whitespace: it would match every post and hide the fact that the query is empty.
                continue;
            }

            $content_weight = self::weightByContent($fragments);
            $str_weight = $content_weight + $wildcard_weight;

            if ($str_weight + $weighted->weight <= $this->max_weight) {
                $weighted->weight += $str_weight;
                $weighted->body[] = $fragments;
            }
        }

        return $weighted;
    }

    /**
     * Run a search on user posts with the given filters.
     *
     * @param string $ip Source IP, for logging.
     * @param string $raw_query The raw query typed by the user, for logging.
     * @param SearchFilters $filters The filters made by {@see self::reduceAndWeight()}.
     * @param ?string $fallback_board Fallback board if there isn't a board filter.
     * @return ?array Data array straight from the PDO, with all the fields in posts.sql, or null if the query was too broad.
     */
    public function search(string $ip, string $raw_query, SearchFilters $filters, ?string $fallback_board): ?array {
        $has_board_filter = $filters->board !== null && $filters->board !== '';
        $board = $has_board_filter ? $filters->board : $fallback_board;
        if ($board === null) {
            return [];
        }

        // Only board is specified.
        if (empty($filters->subject) &&
            empty($filters->name) &&
            empty($filters->flag) &&
            $filters->id === null &&
            $filters->thread === null &&
            empty($filters->body)
        ) {
            return null;
        }

        // Strict: the board ends up in a table name, numeric lookalikes ("1e1" for "10") must not pass.
        if (!\in_array($board, $this->searchable_board_uris, true)) {
            return [];
        }

        $weight_perc = $this->max_weight > 0 ? ($filters->weight / $this->max_weight) * 100 : 100.0;
        // Over 85% of the weight is worth a notice.
        $level = $weight_perc > 85 ? LogDriver::NOTICE : LogDriver::INFO;
        $this->log->log($level, "$ip search: weight $weight_perc ({$filters->weight}) query '$raw_query'");

        $flags = [];
        // The flag filter is an array, empty when the user gave none: without this check every search would be
        // restricted to posts carrying any of the known flags.
        if (!empty($filters->flag) && !empty($this->flag_map)) {
            $flags = self::matchStrings($this->flag_map, $filters->flag);
            if (empty($flags)) {
                // The query doesn't match any flags so it will always fail anyway.
                return [];
            }
        }

        return $this->user_queries->searchPosts(
            $board,
            $filters->subject,
            $filters->name,
            $flags,
            $filters->id,
            $filters->thread,
            $filters->body,
            $this->post_limit
        );
    }

    /**
     * Check if the IP-query pair passes the limit.
     *
     * @param string $ip Source IP.
     * @param string $raw_query The search query.
     * @return bool True if the request goes over the limit.
     */
    public function checkFlood(string $ip, string $raw_query): bool {
        return $this->search_queries->checkFlood($ip, $raw_query);
    }

    /**
     * Returns the uris of the boards that may be searched.
     */
    public function getSearchableBoards(): array {
        return $this->searchable_board_uris;
    }

    /**
     * @return bool True if the flag filter is enabled.
     */
    public function isFlagFilterEnabled(): bool {
        return !empty($this->flag_map);
    }
}

View file

@ -1856,7 +1856,15 @@
// Limit of search results
$config['search']['search_limit'] = 100;
// Boards for searching
// Maximum weight of the search query.
// Body search filters are discarded if they make the query heavier than this.
$config['search']['max_weight'] = 80;
// Maximum length of the user sent search query.
// Characters beyond the limit are truncated and ignored.
$config['search']['max_length'] = 768;
// Uncomment to limit the search feature to the given boards by uri.
//$config['search']['boards'] = array('a', 'b', 'c', 'd', 'e');
// Enable public logs? 0: NO, 1: YES, 2: YES, but drop names

View file

@ -1,8 +1,9 @@
<?php
namespace Vichan;
use Vichan\Data\{IpNoteQueries, ReportQueries, UserPostQueries};
use Vichan\Data\{IpNoteQueries, ReportQueries, SearchQueries, UserPostQueries, Flags};
use Vichan\Data\Driver\{CacheDriver, ErrorLogLogDriver, FileLogDriver, LogDriver, StderrLogDriver, SyslogLogDriver};
use Vichan\Service\SearchService;
defined('TINYBOARD') or exit;
@ -69,6 +70,29 @@ function build_context(array $config): Context {
sql_open();
return $pdo;
},
// Builds the search service out of the configuration.
SearchService::class => function($c) {
$config = $c->get('config');
// Pick the flags the search can filter by: the custom user flags take precedence over the embedded country
// flags. Null disables the flag filter.
if ($config['user_flag']) {
$flags = $config['user_flags'];
} elseif ($config['country_flags']) {
$flags = Flags::EMBEDDED_FLAGS;
} else {
$flags = null;
}
// Optional configuration: null is passed on when it is not set.
$board_uris = $config['search']['boards'] ?? null;
return new SearchService(
$c->get(LogDriver::class),
$c->get(UserPostQueries::class),
$c->get(SearchQueries::class),
$flags,
$config['search']['max_weight'],
$config['search']['max_length'],
$config['search']['search_limit'],
$board_uris
);
},
ReportQueries::class => function($c) {
$auto_maintenance = (bool)$c->get('config')['auto_maintenance'];
$pdo = $c->get(\PDO::class);
@ -78,5 +102,19 @@ function build_context(array $config): Context {
return new UserPostQueries($c->get(\PDO::class));
},
IpNoteQueries::class => fn($c) => new IpNoteQueries($c->get(\PDO::class), $c->get(CacheDriver::class)),
// Builds the flood control for the searches out of the configuration.
SearchQueries::class => function($c) {
$config = $c->get('config');
// Both options are pairs: the maximum number of queries, then the length of the time window.
list($queries_for_single, $range_for_single_min) = $config['search']['queries_per_minutes'];
list($queries_for_all, $range_for_all_min) = $config['search']['queries_per_minutes_all'];
return new SearchQueries(
$c->get(\PDO::class),
$queries_for_single,
// The windows are multiplied by 60 because SearchQueries takes its ranges in seconds.
$range_for_single_min * 60,
$queries_for_all,
$range_for_all_min * 60,
(bool)$config['auto_maintenance']
);
}
]);
}

View file

@ -1,178 +1,77 @@
<?php
use Vichan\Service\SearchService;
require 'inc/bootstrap.php';
if (!$config['search']['enable']) {
die(_("Post search is disabled"));
}
$queries_per_minutes = $config['search']['queries_per_minutes'];
$queries_per_minutes_all = $config['search']['queries_per_minutes_all'];
$search_limit = $config['search']['search_limit'];
$ctx = Vichan\build_context($config);
$search_service = $ctx->get(SearchService::class);
if (isset($config['search']['boards'])) {
$boards = $config['search']['boards'];
} else {
$boards = listBoards(TRUE);
}
if (isset($_GET['search']) && !empty($_GET['search'])) {
$raw_search = $_GET['search'];
$ip = $_SERVER['REMOTE_ADDR'];
$fallback_board = (isset($_GET['board']) && !empty($_GET['board'])) ? $_GET['board'] : null;
$body = Element('search_form.html', Array('boards' => $boards, 'board' => isset($_GET['board']) ? $_GET['board'] : false, 'search' => isset($_GET['search']) ? str_replace('"', '&quot;', utf8tohtml($_GET['search'])) : false));
if (isset($_GET['search']) && !empty($_GET['search']) && isset($_GET['board']) && in_array($_GET['board'], $boards)) {
$phrase = $_GET['search'];
$_body = '';
$query = prepare("SELECT COUNT(*) FROM ``search_queries`` WHERE `ip` = :ip AND `time` > :time");
$query->bindValue(':ip', $_SERVER['REMOTE_ADDR']);
$query->bindValue(':time', time() - ($queries_per_minutes[1] * 60));
$query->execute() or error(db_error($query));
if ($query->fetchColumn() > $queries_per_minutes[0])
if ($search_service->checkFlood($ip, $raw_search)) {
error(_('Wait a while before searching again, please.'));
$query = prepare("SELECT COUNT(*) FROM ``search_queries`` WHERE `time` > :time");
$query->bindValue(':time', time() - ($queries_per_minutes_all[1] * 60));
$query->execute() or error(db_error($query));
if ($query->fetchColumn() > $queries_per_minutes_all[0])
error(_('Wait a while before searching again, please.'));
$query = prepare("INSERT INTO ``search_queries`` VALUES (:ip, :time, :query)");
$query->bindValue(':ip', $_SERVER['REMOTE_ADDR']);
$query->bindValue(':time', time());
$query->bindValue(':query', $phrase);
$query->execute() or error(db_error($query));
_syslog(LOG_NOTICE, 'Searched /' . $_GET['board'] . '/ for "' . $phrase . '"');
// Cleanup search queries table
$query = prepare("DELETE FROM ``search_queries`` WHERE `time` <= :time");
$query->bindValue(':time', time() - ($queries_per_minutes_all[1] * 60));
$query->execute() or error(db_error($query));
openBoard($_GET['board']);
$filters = Array();
function search_filters($m) {
global $filters;
$name = $m[2];
$value = isset($m[4]) ? $m[4] : $m[3];
if (!in_array($name, array('id', 'thread', 'subject', 'name'))) {
// unknown filter
return $m[0];
}
$filters[$name] = $value;
// Actually do the search.
$parse_res = $search_service->parse($raw_search);
$filters = $search_service->reduceAndWeight($parse_res);
$search_res = $search_service->search($ip, $raw_search, $filters, $fallback_board);
return $m[1];
}
// Needed to set a global variable further down the stack, plus the template.
$actual_board = $filters->board ?? $fallback_board;
$phrase = trim(preg_replace_callback('/(^|\s)(\w+):("(.*)?"|[^\s]*)/', 'search_filters', $phrase));
$body = Element('search_form.html', [
'boards' => $search_service->getSearchableBoards(),
'board' => $actual_board,
'search' => \str_replace('"', '&quot;', utf8tohtml($_GET['search'])),
'flags_enabled' => $search_service->isFlagFilterEnabled()
]);
if (!preg_match('/[^*^\s]/', $phrase) && empty($filters)) {
_syslog(LOG_WARNING, 'Query too broad.');
$body .= '<p class="unimportant" style="text-align:center">(Query too broad.)</p>';
echo Element('page.html', Array(
'config'=>$config,
'title'=>'Search',
'body'=>$body,
));
exit;
}
if ($search_res === null) {
$body .= '<hr/><p style="text-align:center" class="unimportant">(' . _('Query too broad.') . ')</p>';
} elseif (empty($search_res)) {
$body .= '<hr/><p style="text-align:center" class="unimportant">(' . _('No results.') . ')</p>';
} else {
$body .= '<hr/>';
// Escape escape character
$phrase = str_replace('!', '!!', $phrase);
openBoard($actual_board);
// Remove SQL wildcard
$phrase = str_replace('%', '!%', $phrase);
// Use asterisk as wildcard to suit convention
$phrase = str_replace('*', '%', $phrase);
// Remove `, it's used by table prefix magic
$phrase = str_replace('`', '!`', $phrase);
$like = '';
$match = Array();
// Find exact phrases
if (preg_match_all('/"(.+?)"/', $phrase, $m)) {
foreach($m[1] as &$quote) {
$phrase = str_replace("\"{$quote}\"", '', $phrase);
$match[] = $pdo->quote($quote);
}
}
$words = explode(' ', $phrase);
foreach($words as &$word) {
if (empty($word)) {
continue;
}
$match[] = $pdo->quote($word);
}
$like = '';
foreach($match as &$phrase) {
if (!empty($like)) {
$like .= ' AND ';
}
$phrase = preg_replace('/^\'(.+)\'$/', '\'%$1%\'', $phrase);
$like .= '`body` LIKE ' . $phrase . ' ESCAPE \'!\'';
}
foreach($filters as $name => $value) {
if (!empty($like)) {
$like .= ' AND ';
}
$like .= '`' . $name . '` = '. $pdo->quote($value);
}
$like = str_replace('%', '%%', $like);
$query = prepare(sprintf("SELECT * FROM ``posts_%s`` WHERE " . $like . " ORDER BY `time` DESC LIMIT :limit", $board['uri']));
$query->bindValue(':limit', $search_limit, PDO::PARAM_INT);
$query->execute() or error(db_error($query));
if ($query->rowCount() == $search_limit) {
_syslog(LOG_WARNING, 'Query too broad.');
$body .= '<p class="unimportant" style="text-align:center">('._('Query too broad.').')</p>';
echo Element('page.html', Array(
'config'=>$config,
'title'=>'Search',
'body'=>$body,
));
exit;
}
$temp = '';
while ($post = $query->fetch()) {
$posts_html = '';
foreach ($search_res as $post) {
if (!$post['thread']) {
$po = new Thread($post);
} else {
$po = new Post($post);
}
$temp .= $po->build(true) . '<hr/>';
$posts_html .= $po->build(true) . '<hr/>';
}
if (!empty($temp))
$_body .= '<fieldset><legend>' .
sprintf(ngettext('%d result in', '%d results in', $query->rowCount()),
$query->rowCount()) . ' <a href="/' .
sprintf($config['board_path'], $board['uri']) . $config['file_index'] .
'">' .
$body .= '<fieldset><legend>' .
sprintf(ngettext('%d result in', '%d results in', \count($search_res)), \count($search_res)) . ' <a href="/' .
sprintf($config['board_path'], $board['uri']) . $config['file_index'] . '">' .
sprintf($config['board_abbreviation'], $board['uri']) . ' - ' . $board['title'] .
'</a></legend>' . $temp . '</fieldset>';
$body .= '<hr/>';
if (!empty($_body)) {
$body .= $_body;
} else {
$body .= '<p style="text-align:center" class="unimportant">('._('No results.').')</p>';
'</a></legend>' . $posts_html . '</fieldset>';
}
} else {
$body = Element('search_form.html', [
'boards' => $search_service->getSearchableBoards(),
'board' => false,
'search' => false,
'flags_enabled' => $search_service->isFlagFilterEnabled()
]);
}
echo Element('page.html', Array(
'config'=>$config,
'title'=>_('Search'),
'body'=>'' . $body
'title'=> _('Search'),
'body'=> $body
));

View file

@ -753,10 +753,6 @@ table.test td img {
margin: 0;
}
fieldset label {
display: block;
}
div.pages {
/*! color: #89A; */
/*! background: #D6DAF0; */

View file

@ -1,9 +1,14 @@
<div class="ban">
<style>
form > p {
align-content: center;
}
</style>
<h2>{% trans %}Search{% endtrans %}</h2>
<form style="display:inline" action="" method="get">
<form style="width:100%;display:flex;flex-wrap:wrap;justify-content:space-between;text-align:start" action="" method="get">
<p style="padding-right:0"><label for="search">{% trans %}Phrase:{% endtrans %}</label></p>
<p style="flex-grow:1"><input style="width:100%" id="search" name="search" type="text" size="40" value="{{ search }}"></p>
<p>
<label style="display:inline" for="search">{% trans %}Phrase:{% endtrans %}</label>
<input id="search" name="search" type="text" size="40" value="{{ search }}">
<select name="board">
<option value="none">{% trans %}Select board{% endtrans %}&hellip;</option>
@ -19,6 +24,10 @@
</p>
</form>
<p style="font-size:8pt;margin:5px">
{% trans %}Search is case-insensitive and based on keywords. To match exact phrases, use "quotes". Use an asterisk (*) for wildcard.</p><p style="font-size:8pt;margin:5px">You may apply the following filters to your searches: <strong>id</strong>, <strong>thread</strong>, <strong>subject</strong>, and <strong>name</strong>. To apply a filter, simply add to your query, for example, <em>name:Anonymous</em> or <em>subject:"Some Thread"</em>. Wildcards cannot be used in filters.{% endtrans %}
{% if flags_enabled %}
{% trans %}Search is case-insensitive and based on keywords. To match exact phrases, use "quotes". Use an asterisk (*) for wildcard.</p><p style="font-size:8pt;margin:5px">You may apply the following filters to your searches: <strong>id</strong>, <strong>thread</strong>, <strong>subject</strong>, <strong>name</strong>, <strong>flag</strong> and <strong>board</strong> (as an alternative syntax). To apply a filter, simply add to your query, for example, <em>name:Anonymous</em> or <em>subject:"Some Thread"</em>. The <strong>id</strong>, <strong>thread</strong> and <strong>board</strong> filters do not support wildcards.{% endtrans %}
{% else %}
{% trans %}Search is case-insensitive and based on keywords. To match exact phrases, use "quotes". Use an asterisk (*) for wildcard.</p><p style="font-size:8pt;margin:5px">You may apply the following filters to your searches: <strong>id</strong>, <strong>thread</strong>, <strong>subject</strong>, <strong>name</strong> and <strong>board</strong> (as an alternative syntax). To apply a filter, simply add to your query, for example, <em>name:Anonymous</em> or <em>subject:"Some Thread"</em>. The <strong>id</strong>, <strong>thread</strong> and <strong>board</strong> filters do not support wildcards.{% endtrans %}
{% endif %}
</p>
</div>

View file

@ -3,7 +3,7 @@
* Performs maintenance tasks. Invoke this periodically if the auto_maintenance configuration option is turned off.
*/
use Vichan\Data\ReportQueries;
use Vichan\Data\{ReportQueries, SearchQueries};
require dirname(__FILE__) . '/inc/cli.php';
@ -45,9 +45,17 @@ if ($config['cache']['enabled'] === 'fs') {
$fs_cache->collect();
$delta = microtime(true) - $start;
echo "Deleted $deleted_count expired filesystem cache items in $delta seconds!\n";
$time_tot = $delta;
$time_tot += $delta;
$deleted_tot = $deleted_count;
}
// Purge the expired entries of the search log, the table behind the search flood control.
echo "Clearing old search log...\n";
$search_queries = $ctx->get(SearchQueries::class);
$start = microtime(true);
$deleted_count = $search_queries->purgeExpired();
$delta = microtime(true) - $start;
// Add to the running totals instead of overwriting them, or the summary logged at the end would only account for
// this last task. The ?? guards against the totals not having been started by an earlier task.
$time_tot = ($time_tot ?? 0) + $delta;
$deleted_tot = ($deleted_tot ?? 0) + $deleted_count;
$time_tot = number_format((float)$time_tot, 4, '.', '');
modLog("Deleted $deleted_tot expired entries in {$time_tot}s with maintenance tool");