This commit is contained in:
Marek
2026-03-24 00:04:55 +01:00
commit c5229e48ed
4225 changed files with 511461 additions and 0 deletions

View File

@@ -0,0 +1,77 @@
<?php
declare(strict_types=1);
namespace Doctrine\SqlFormatter;
use function sprintf;
/**
 * Highlighter that decorates tokens with ANSI escape sequences for terminal output.
 */
final class CliHighlighter implements Highlighter
{
    public const HIGHLIGHT_FUNCTIONS = 'functions';

    /** ANSI sequence appended after every decorated value to reset the terminal attributes. */
    private const RESET_SEQUENCE = "\x1b[0m";

    /** @var array<string, string> */
    private array $escapeSequences;

    /**
     * Caller supplied sequences take precedence; any highlight key that is
     * missing falls back to the built-in default below.
     *
     * @param array<string, string> $escapeSequences
     */
    public function __construct(array $escapeSequences = [])
    {
        $defaults = [
            self::HIGHLIGHT_QUOTE => "\x1b[34;1m",
            self::HIGHLIGHT_BACKTICK_QUOTE => "\x1b[35;1m",
            self::HIGHLIGHT_RESERVED => "\x1b[37m",
            self::HIGHLIGHT_BOUNDARY => '',
            self::HIGHLIGHT_NUMBER => "\x1b[32;1m",
            self::HIGHLIGHT_WORD => '',
            self::HIGHLIGHT_ERROR => "\x1b[31;1;7m",
            self::HIGHLIGHT_COMMENT => "\x1b[30;1m",
            self::HIGHLIGHT_VARIABLE => "\x1b[36;1m",
            self::HIGHLIGHT_FUNCTIONS => "\x1b[37m",
        ];

        // Array union keeps the left-hand (caller) value for duplicate keys.
        $this->escapeSequences = $escapeSequences + $defaults;
    }

    /**
     * Wraps a token value in its escape sequence followed by a reset sequence.
     *
     * Parentheses and token types without a highlight mapping are returned untouched.
     */
    public function highlightToken(int $type, string $value): string
    {
        $isParenthesis = $type === Token::TOKEN_TYPE_BOUNDARY && ($value === '(' || $value === ')');
        if ($isParenthesis) {
            return $value;
        }

        $sequence = $this->prefix($type);

        return $sequence === null
            ? $value
            : $sequence . $value . self::RESET_SEQUENCE;
    }

    /**
     * Looks up the escape sequence for a token type, or null when the type has no mapping.
     *
     * @param Token::TOKEN_TYPE_* $type
     */
    private function prefix(int $type): string|null
    {
        $highlight = self::TOKEN_TYPE_TO_HIGHLIGHT[$type] ?? null;
        if ($highlight === null) {
            return null;
        }

        return $this->escapeSequences[$highlight];
    }

    /**
     * Renders the value on a new line using the error escape sequence.
     */
    public function highlightError(string $value): string
    {
        return "\n" . $this->escapeSequences[self::HIGHLIGHT_ERROR] . $value . self::RESET_SEQUENCE;
    }

    /**
     * Error messages are rendered exactly like erroneous tokens.
     */
    public function highlightErrorMessage(string $value): string
    {
        return $this->highlightError($value);
    }

    /**
     * Terminates the finished output with a line feed.
     */
    public function output(string $string): string
    {
        return $string . "\n";
    }
}

View File

@@ -0,0 +1,52 @@
<?php
declare(strict_types=1);
namespace Doctrine\SqlFormatter;
/**
 * Bidirectional cursor over a fixed list of tokens.
 */
final class Cursor
{
    /**
     * Index of the token the cursor currently points at.
     *
     * Kept within [-1, count($tokens)]: -1 means "before the first token",
     * count($tokens) means "after the last token".
     */
    private int $position = -1;

    /** @param list<Token> $tokens */
    public function __construct(
        private readonly array $tokens,
    ) {
    }

    /**
     * Advances to the next token, skipping tokens of the excluded type.
     *
     * Once the end has been reached the cursor stays just past the last token,
     * so repeated calls keep returning null without drifting further away and
     * a following previous() still yields the last token.
     *
     * @param Token::TOKEN_TYPE_* $exceptTokenType
     *
     * @return Token|null The next matching token, or null when there is none
     */
    public function next(int|null $exceptTokenType = null): Token|null
    {
        $count = \count($this->tokens);

        while ($this->position < $count) {
            $token = $this->tokens[++$this->position] ?? null;
            if ($token === null) {
                break;
            }

            if ($exceptTokenType !== null && $token->isOfType($exceptTokenType)) {
                continue;
            }

            return $token;
        }

        return null;
    }

    /**
     * Moves back to the previous token, skipping tokens of the excluded type.
     *
     * Once the start has been reached the cursor stays just before the first
     * token, so repeated calls keep returning null and a following next()
     * still yields the first token.
     *
     * @param Token::TOKEN_TYPE_* $exceptTokenType
     *
     * @return Token|null The previous matching token, or null when there is none
     */
    public function previous(int|null $exceptTokenType = null): Token|null
    {
        while ($this->position >= 0) {
            $token = $this->tokens[--$this->position] ?? null;
            if ($token === null) {
                break;
            }

            if ($exceptTokenType !== null && $token->isOfType($exceptTokenType)) {
                continue;
            }

            return $token;
        }

        return null;
    }

    /**
     * Creates an independent cursor over the same tokens, starting at the current position.
     */
    public function subCursor(): self
    {
        $cursor = new self($this->tokens);
        $cursor->position = $this->position;

        return $cursor;
    }
}

View File

@@ -0,0 +1,58 @@
<?php
declare(strict_types=1);
namespace Doctrine\SqlFormatter;
/**
 * Contract for turning tokens and error texts into decorated output.
 */
interface Highlighter
{
    // Highlight categories; used as keys of the per-implementation style maps
    public const HIGHLIGHT_BOUNDARY = 'boundary';
    public const HIGHLIGHT_WORD = 'word';
    public const HIGHLIGHT_BACKTICK_QUOTE = 'backtickQuote';
    public const HIGHLIGHT_QUOTE = 'quote';
    public const HIGHLIGHT_RESERVED = 'reserved';
    public const HIGHLIGHT_NUMBER = 'number';
    public const HIGHLIGHT_VARIABLE = 'variable';
    public const HIGHLIGHT_COMMENT = 'comment';
    public const HIGHLIGHT_ERROR = 'error';

    // Token type => highlight category (whitespace tokens have no entry)
    public const TOKEN_TYPE_TO_HIGHLIGHT = [
        Token::TOKEN_TYPE_BOUNDARY => self::HIGHLIGHT_BOUNDARY,
        Token::TOKEN_TYPE_WORD => self::HIGHLIGHT_WORD,
        Token::TOKEN_TYPE_BACKTICK_QUOTE => self::HIGHLIGHT_BACKTICK_QUOTE,
        Token::TOKEN_TYPE_QUOTE => self::HIGHLIGHT_QUOTE,
        Token::TOKEN_TYPE_RESERVED => self::HIGHLIGHT_RESERVED,
        Token::TOKEN_TYPE_RESERVED_TOPLEVEL => self::HIGHLIGHT_RESERVED,
        Token::TOKEN_TYPE_RESERVED_NEWLINE => self::HIGHLIGHT_RESERVED,
        Token::TOKEN_TYPE_NUMBER => self::HIGHLIGHT_NUMBER,
        Token::TOKEN_TYPE_VARIABLE => self::HIGHLIGHT_VARIABLE,
        Token::TOKEN_TYPE_COMMENT => self::HIGHLIGHT_COMMENT,
        Token::TOKEN_TYPE_BLOCK_COMMENT => self::HIGHLIGHT_COMMENT,
    ];

    /**
     * Decorates a single token value according to its token type.
     *
     * @param Token::TOKEN_TYPE_* $type
     */
    public function highlightToken(int $type, string $value): string;

    /**
     * Decorates a token that was found to be problematic.
     */
    public function highlightError(string $value): string;

    /**
     * Decorates a human readable error message.
     */
    public function highlightErrorMessage(string $value): string;

    /**
     * Finalizes the assembled string before it is handed back to the caller.
     *
     * @param string $string The fully assembled (already highlighted) string
     *
     * @return string The string in its final output form
     */
    public function output(string $string): string;
}

View File

@@ -0,0 +1,93 @@
<?php
declare(strict_types=1);
namespace Doctrine\SqlFormatter;
use function htmlentities;
use function sprintf;
use function trim;
use const ENT_COMPAT;
use const ENT_IGNORE;
/**
 * Highlighter that wraps tokens in HTML <span> elements.
 */
final class HtmlHighlighter implements Highlighter
{
    public const HIGHLIGHT_PRE = 'pre';

    /** @var array<string, string> */
    private readonly array $htmlAttributes;

    /**
     * Caller supplied attributes take precedence; any highlight key that is
     * missing falls back to the built-in default below.
     *
     * @param array<string, string> $htmlAttributes
     * @param bool                  $usePre         This flag tells us if queries need to be enclosed in <pre> tags
     */
    public function __construct(
        array $htmlAttributes = [],
        private readonly bool $usePre = true,
    ) {
        $this->htmlAttributes = $htmlAttributes + [
            self::HIGHLIGHT_QUOTE => 'style="color: blue;"',
            self::HIGHLIGHT_BACKTICK_QUOTE => 'style="color: purple;"',
            self::HIGHLIGHT_RESERVED => 'style="font-weight:bold;"',
            self::HIGHLIGHT_BOUNDARY => '',
            self::HIGHLIGHT_NUMBER => 'style="color: green;"',
            self::HIGHLIGHT_WORD => 'style="color: #333;"',
            self::HIGHLIGHT_ERROR => 'style="background-color: red;"',
            self::HIGHLIGHT_COMMENT => 'style="color: #aaa;"',
            self::HIGHLIGHT_VARIABLE => 'style="color: orange;"',
            self::HIGHLIGHT_PRE => 'style="color: black; background-color: white;"',
        ];
    }

    /**
     * Escapes the token value and wraps it in a <span> carrying the attributes of its type.
     *
     * Parentheses and token types without a highlight mapping are returned
     * escaped but without a surrounding element.
     */
    public function highlightToken(int $type, string $value): string
    {
        $value = $this->escape($value);

        if ($type === Token::TOKEN_TYPE_BOUNDARY && ($value === '(' || $value === ')')) {
            return $value;
        }

        $attributes = $this->attributes($type);
        if ($attributes === null) {
            return $value;
        }

        return '<span ' . $attributes . '>' . $value . '</span>';
    }

    /**
     * Returns the HTML attribute string for a token type, or null when the type has no mapping.
     *
     * @param Token::TOKEN_TYPE_* $type
     */
    public function attributes(int $type): string|null
    {
        if (! isset(self::TOKEN_TYPE_TO_HIGHLIGHT[$type])) {
            return null;
        }

        return $this->htmlAttributes[self::TOKEN_TYPE_TO_HIGHLIGHT[$type]];
    }

    /**
     * Renders the value on a new line inside a <span> carrying the error attributes.
     *
     * The value is raw text (a token value or a message), so it is escaped the
     * same way highlightToken() escapes token values before being embedded.
     */
    public function highlightError(string $value): string
    {
        return sprintf(
            '%s<span %s>%s</span>',
            "\n",
            $this->htmlAttributes[self::HIGHLIGHT_ERROR],
            $this->escape($value),
        );
    }

    /**
     * Error messages are rendered exactly like erroneous tokens.
     */
    public function highlightErrorMessage(string $value): string
    {
        return $this->highlightError($value);
    }

    /**
     * Trims the output and, unless disabled, encloses it in a <pre> element.
     */
    public function output(string $string): string
    {
        $string = trim($string);
        if (! $this->usePre) {
            return $string;
        }

        return '<pre ' . $this->htmlAttributes[self::HIGHLIGHT_PRE] . '>' . $string . '</pre>';
    }

    /**
     * Converts raw text into HTML-safe text; single place that defines the escaping flags.
     */
    private function escape(string $value): string
    {
        return htmlentities($value, ENT_COMPAT | ENT_IGNORE, 'UTF-8');
    }
}

View File

@@ -0,0 +1,28 @@
<?php
declare(strict_types=1);
namespace Doctrine\SqlFormatter;
/**
 * Highlighter that adds no decoration at all.
 */
final class NullHighlighter implements Highlighter
{
    /**
     * Token values pass through unchanged, whatever their type.
     */
    public function highlightToken(int $type, string $value): string
    {
        return $value;
    }

    /**
     * Erroneous tokens pass through unchanged.
     */
    public function highlightError(string $value): string
    {
        return $value;
    }

    /**
     * Error messages are only separated from preceding output by one leading space.
     */
    public function highlightErrorMessage(string $value): string
    {
        return ' ' . $value;
    }

    /**
     * The assembled string is already the final output.
     */
    public function output(string $string): string
    {
        return $string;
    }
}

View File

@@ -0,0 +1,485 @@
<?php
declare(strict_types=1);
/**
* SQL Formatter is a collection of utilities for debugging SQL queries.
* It includes methods for formatting, syntax highlighting, removing comments, etc.
*
* @link http://github.com/jdorn/sql-formatter
*/
namespace Doctrine\SqlFormatter;
use function array_pop;
use function array_search;
use function assert;
use function end;
use function in_array;
use function preg_replace;
use function rtrim;
use function str_repeat;
use function str_replace;
use function strlen;
use function strtoupper;
use function substr;
use function trim;
use const PHP_SAPI;
final class SqlFormatter
{
private readonly Highlighter $highlighter;
private readonly Tokenizer $tokenizer;
private const INDENT_TYPE_BLOCK = 'block';
private const INDENT_TYPE_SPECIAL = 'special';
/**
 * @param Highlighter|null $highlighter Highlighter to use; when omitted a CliHighlighter is
 *                                      picked under the "cli" SAPI and an HtmlHighlighter otherwise
 */
public function __construct(Highlighter|null $highlighter = null)
{
    if ($highlighter === null) {
        $highlighter = PHP_SAPI === 'cli'
            ? new CliHighlighter()
            : new HtmlHighlighter();
    }

    $this->highlighter = $highlighter;
    $this->tokenizer = new Tokenizer();
}
/**
 * Format the whitespace in a SQL string to make it easier to read.
 *
 * The string is tokenized and re-emitted token by token (whitespace tokens are
 * skipped) with newlines and tab indentation inserted depending on the token.
 * Every highlighted token comes from the configured highlighter, and the final
 * string is passed through the highlighter's output() method.
 *
 * @param string $string       The SQL string
 * @param string $indentString String that replaces every tab character of the formatted result
 *
 * @return string The formatted SQL string as returned by the configured highlighter's output()
 */
public function format(string $string, string $indentString = ' '): string
{
// This variable will be populated with the formatted output
$return = '';
// Use an actual tab while formatting and then switch out with $indentString at the end
$tab = "\t";
// Current indentation depth (number of tabs emitted after a newline)
$indentLevel = 0;
// When true, a newline is emitted before the next token
$newline = false;
// True while inside parentheses that are kept on one line
$inlineParentheses = false;
// Deferred indent increases; applied at the start of the next iteration
$increaseSpecialIndent = false;
$increaseBlockIndent = false;
// Stack of INDENT_TYPE_* values, one entry per active indent level
$indentTypes = [];
// True when a newline was emitted before the current token in this iteration
$addedNewline = false;
// Number of characters seen since the last line break inside inline parentheses
$inlineCount = 0;
// True when the content of the current inline parentheses was moved to an indented line
$inlineIndented = false;
// True while processing the arguments of a LIMIT clause
$clauseLimit = false;
// Starts a new line at the current indent level unless one was already added for this token
$appendNewLineIfNotAddedFx = static function () use (&$addedNewline, &$return, $tab, &$indentLevel): void {
// Add a newline if not already added
if ($addedNewline) { // @phpstan-ignore if.alwaysFalse
return;
}
$return = rtrim($return, ' ' . $tab);
$return .= "\n" . str_repeat($tab, $indentLevel);
};
// Pops one indent level and, if the output currently ends with an indentation-only line,
// rewrites that trailing indentation to match the new level
$decreaseIndentationLevelFx = static function () use (&$return, &$indentTypes, $tab, &$indentLevel): void {
array_pop($indentTypes);
$indentLevel--;
// Redo the indentation since it may be different now
$lastPossiblyIndentLine = substr($return, -($indentLevel + 2));
if (rtrim($lastPossiblyIndentLine, $tab) !== "\n") {
return;
}
$rtrimLength = $indentLevel + 1;
while (substr($return, -($rtrimLength + 2), 1) === "\n") {
$rtrimLength++;
}
$return = substr($return, 0, -$rtrimLength) . str_repeat($tab, $indentLevel);
};
// Tokenize String
$cursor = $this->tokenizer->tokenize($string);
// Format token by token
while ($token = $cursor->next(Token::TOKEN_TYPE_WHITESPACE)) {
$prevNotWhitespaceToken = $cursor->subCursor()->previous(Token::TOKEN_TYPE_WHITESPACE);
$tokenValueUpper = strtoupper($token->value());
// A token directly following a "." never matches any of the keyword comparisons below
if ($prevNotWhitespaceToken !== null && $prevNotWhitespaceToken->value() === '.') {
$tokenValueUpper = false;
}
$highlighted = $this->highlighter->highlightToken(
$token->type(),
$token->value(),
);
// If we are increasing the special indent level now
if ($increaseSpecialIndent) {
$indentLevel++;
$increaseSpecialIndent = false;
$indentTypes[] = self::INDENT_TYPE_SPECIAL;
}
// If we are increasing the block indent level now
if ($increaseBlockIndent) {
$indentLevel++;
$increaseBlockIndent = false;
$indentTypes[] = self::INDENT_TYPE_BLOCK;
}
// If we need a new line before the token
if ($newline) {
$return = rtrim($return, ' ');
// Leave an empty line after a statement terminator
if ($prevNotWhitespaceToken !== null && $prevNotWhitespaceToken->value() === ';') {
$return .= "\n";
}
$return .= "\n" . str_repeat($tab, $indentLevel);
$newline = false;
$addedNewline = true;
} else {
$addedNewline = false;
}
// Display comments directly where they appear in the source
if ($token->isOfType(Token::TOKEN_TYPE_COMMENT, Token::TOKEN_TYPE_BLOCK_COMMENT)) {
if ($token->isOfType(Token::TOKEN_TYPE_BLOCK_COMMENT)) {
// Block comments start on their own line and every line of them is indented
$indent = str_repeat($tab, $indentLevel);
$return = rtrim($return, ' ' . $tab);
$return .= "\n" . $indent;
$highlighted = str_replace("\n", "\n" . $indent, $highlighted);
}
$return .= $highlighted;
$newline = true;
continue;
}
if ($inlineParentheses) {
// End of inline parentheses
if ($token->value() === ')') {
$return = rtrim($return, ' ');
if ($inlineIndented) {
$decreaseIndentationLevelFx();
$return = rtrim($return, ' ');
$return .= "\n" . str_repeat($tab, $indentLevel);
}
$inlineParentheses = false;
$return .= $highlighted . ' ';
continue;
}
// Break the line after a comma once at least 30 characters were emitted since the last break
if ($token->value() === ',') {
if ($inlineCount >= 30) {
$inlineCount = 0;
$newline = true;
}
}
$inlineCount += strlen($token->value());
}
// Opening parentheses increase the block indent level and start a new line
if ($token->value() === '(') {
// First check if this should be an inline parentheses block
// Examples are "NOW()", "COUNT(*)", "int(10)", key(`somecolumn`), DECIMAL(7,2)
// Look ahead over at most 250 non-whitespace tokens for the matching ")"
$length = 0;
$subCursor = $cursor->subCursor();
for ($j = 1; $j <= 250; $j++) {
// Reached end of string
$next = $subCursor->next(Token::TOKEN_TYPE_WHITESPACE);
if ($next === null) {
break;
}
// Reached closing parentheses, able to inline it
if ($next->value() === ')') {
$inlineParentheses = true;
$inlineCount = 0;
$inlineIndented = false;
break;
}
// Reached an invalid token for inline parentheses
if ($next->value() === ';' || $next->value() === '(') {
break;
}
// Reached an invalid token type for inline parentheses
if (
$next->isOfType(
Token::TOKEN_TYPE_RESERVED_TOPLEVEL,
Token::TOKEN_TYPE_RESERVED_NEWLINE,
Token::TOKEN_TYPE_COMMENT,
Token::TOKEN_TYPE_BLOCK_COMMENT,
)
) {
break;
}
$length += strlen($next->value());
}
// Inline content longer than 30 characters is moved to its own indented line
if ($inlineParentheses && $length > 30) {
$increaseBlockIndent = true;
$inlineIndented = true;
$newline = true;
}
// Take out the preceding space unless there was whitespace there in the original query
$prevToken = $cursor->subCursor()->previous();
if ($prevToken !== null && ! $prevToken->isOfType(Token::TOKEN_TYPE_WHITESPACE)) {
$return = rtrim($return, ' ');
}
if (! $inlineParentheses) {
$increaseBlockIndent = true;
// Add a newline after the parentheses
$newline = true;
}
} elseif ($token->value() === ')') {
// Closing parentheses decrease the block indent level
// Remove whitespace before the closing parentheses
$return = rtrim($return, ' ');
// Drop any special indents opened inside the block, then the block indent itself
while (end($indentTypes) === self::INDENT_TYPE_SPECIAL) {
$decreaseIndentationLevelFx();
}
$decreaseIndentationLevelFx();
if ($indentLevel < 0) {
// This is an error
$indentLevel = 0;
$return .= $this->highlighter->highlightError($token->value());
continue;
}
$appendNewLineIfNotAddedFx();
} elseif ($token->isOfType(Token::TOKEN_TYPE_RESERVED_TOPLEVEL)) {
// Top level reserved words start a new line and increase the special indent level
$increaseSpecialIndent = true;
// If the last indent type was special, decrease the special indent for this round
if (end($indentTypes) === self::INDENT_TYPE_SPECIAL) {
$decreaseIndentationLevelFx();
}
// Add a newline after the top level reserved word
$newline = true;
$appendNewLineIfNotAddedFx();
// Collapse whitespace runs inside multi-word keywords to a single space
if ($token->hasExtraWhitespace()) {
$highlighted = preg_replace('/\s+/', ' ', $highlighted);
}
// if SQL 'LIMIT' clause, start variable to reset newline
if ($tokenValueUpper === 'LIMIT' && ! $inlineParentheses) {
$clauseLimit = true;
}
} elseif ($token->value() === ';') {
// If the last indent type was special, decrease the special indent for this round
if (end($indentTypes) === self::INDENT_TYPE_SPECIAL) {
$decreaseIndentationLevelFx();
}
$newline = true;
} elseif ($tokenValueUpper === 'CASE') {
$increaseBlockIndent = true;
} elseif ($tokenValueUpper === 'BEGIN') {
$newline = true;
$increaseBlockIndent = true;
} elseif ($tokenValueUpper === 'LOOP') {
// https://docs.oracle.com/en/database/oracle/oracle-database/19/lnpls/basic-LOOP-statement.html
// The LOOP of "END LOOP" does not open a new block
if ($prevNotWhitespaceToken !== null && strtoupper($prevNotWhitespaceToken->value()) !== 'END') {
$newline = true;
$increaseBlockIndent = true;
}
} elseif (in_array($tokenValueUpper, ['WHEN', 'THEN', 'ELSE', 'END'], true)) {
// WHEN, ELSE and END close the current indent level; THEN and ELSE open a new one
if ($tokenValueUpper !== 'THEN') {
$decreaseIndentationLevelFx();
if ($prevNotWhitespaceToken !== null && strtoupper($prevNotWhitespaceToken->value()) !== 'CASE') {
$appendNewLineIfNotAddedFx();
}
}
if ($tokenValueUpper === 'THEN' || $tokenValueUpper === 'ELSE') {
$newline = true;
$increaseBlockIndent = true;
}
} elseif (
$clauseLimit &&
$token->value() !== ',' &&
! $token->isOfType(Token::TOKEN_TYPE_NUMBER, Token::TOKEN_TYPE_WHITESPACE)
) {
// Checks if we are out of the limit clause
$clauseLimit = false;
} elseif ($token->value() === ',' && ! $inlineParentheses) {
// Commas start a new line (unless within inline parentheses or SQL 'LIMIT' clause)
// If the previous TOKEN_VALUE is 'LIMIT', resets new line
if ($clauseLimit === true) {
$newline = false;
$clauseLimit = false;
} else {
// All other cases of commas
$newline = true;
}
} elseif ($token->isOfType(Token::TOKEN_TYPE_RESERVED_NEWLINE)) {
// Newline reserved words start a new line
$appendNewLineIfNotAddedFx();
// Collapse whitespace runs inside multi-word keywords to a single space
if ($token->hasExtraWhitespace()) {
$highlighted = preg_replace('/\s+/', ' ', $highlighted);
}
} elseif ($token->isOfType(Token::TOKEN_TYPE_BOUNDARY)) {
// Multiple boundary characters in a row should not have spaces between them (not including parentheses)
if ($prevNotWhitespaceToken !== null && $prevNotWhitespaceToken->isOfType(Token::TOKEN_TYPE_BOUNDARY)) {
$prevToken = $cursor->subCursor()->previous();
if ($prevToken !== null && ! $prevToken->isOfType(Token::TOKEN_TYPE_WHITESPACE)) {
$return = rtrim($return, ' ');
}
}
}
// If the token shouldn't have a space before it
if (
$token->value() === '.' ||
$token->value() === ',' ||
$token->value() === ';'
) {
$return = rtrim($return, ' ');
}
$return .= $highlighted . ' ';
// If the token shouldn't have a space after it
if ($token->value() === '(' || $token->value() === '.') {
$return = rtrim($return, ' ');
}
// If this is the "-" of a negative number, it shouldn't have a space after it
if ($token->value() !== '-') {
continue;
}
$nextNotWhitespace = $cursor->subCursor()->next(Token::TOKEN_TYPE_WHITESPACE);
if ($nextNotWhitespace === null || ! $nextNotWhitespace->isOfType(Token::TOKEN_TYPE_NUMBER)) {
continue;
}
$prev = $cursor->subCursor()->previous(Token::TOKEN_TYPE_WHITESPACE);
if ($prev === null) {
continue;
}
// After a quoted string, word or number the "-" is kept as a spaced operator
if (
$prev->isOfType(
Token::TOKEN_TYPE_QUOTE,
Token::TOKEN_TYPE_BACKTICK_QUOTE,
Token::TOKEN_TYPE_WORD,
Token::TOKEN_TYPE_NUMBER,
)
) {
continue;
}
$return = rtrim($return, ' ');
}
// If there are unmatched parentheses
if (array_search(self::INDENT_TYPE_BLOCK, $indentTypes) !== false) {
$return = rtrim($return, ' ');
$return .= $this->highlighter->highlightErrorMessage(
'WARNING: unclosed parentheses or section',
);
}
// Replace tab characters with the configuration tab character
$return = trim(str_replace($tab, $indentString, $return));
return $this->highlighter->output($return);
}
/**
 * Add syntax highlighting to a SQL string without changing its layout.
 *
 * Every token, whitespace included, is passed through the configured
 * highlighter in its original order.
 *
 * @param string $string The SQL string
 *
 * @return string The highlighted SQL string as returned by the configured highlighter's output()
 */
public function highlight(string $string): string
{
    $highlighted = '';
    $cursor = $this->tokenizer->tokenize($string);

    for ($token = $cursor->next(); $token !== null; $token = $cursor->next()) {
        $highlighted .= $this->highlighter->highlightToken($token->type(), $token->value());
    }

    return $this->highlighter->output($highlighted);
}
/**
 * Compress a query by collapsing white space and removing comments
 *
 * @param string $string The SQL string
 *
 * @return string The SQL string without comments and without leading, trailing or repeated whitespace
 */
public function compress(string $string): string
{
    $compressed = '';
    // Starts out true so that leading whitespace is dropped as well
    $lastWasWhitespace = true;
    $cursor = $this->tokenizer->tokenize($string);

    for ($token = $cursor->next(); $token !== null; $token = $cursor->next()) {
        // Comments are dropped; they do not affect the whitespace state
        if ($token->isOfType(Token::TOKEN_TYPE_COMMENT, Token::TOKEN_TYPE_BLOCK_COMMENT)) {
            continue;
        }

        if ($token->isOfType(Token::TOKEN_TYPE_WHITESPACE)) {
            // A run of whitespace tokens contributes exactly one space
            if (! $lastWasWhitespace) {
                $lastWasWhitespace = true;
                $compressed .= ' ';
            }

            continue;
        }

        $lastWasWhitespace = false;
        $value = $token->value();

        // Multi-word keywords keep a single space between their words
        // (e.g. "OUTER", several blanks, "JOIN" becomes "OUTER JOIN")
        $isReserved = $token->isOfType(
            Token::TOKEN_TYPE_RESERVED,
            Token::TOKEN_TYPE_RESERVED_NEWLINE,
            Token::TOKEN_TYPE_RESERVED_TOPLEVEL,
        );
        if ($isReserved) {
            $value = preg_replace('/\s+/', ' ', $value);
            assert($value !== null);
        }

        $compressed .= $value;
    }

    return rtrim($compressed);
}
}

View File

@@ -0,0 +1,63 @@
<?php
declare(strict_types=1);
namespace Doctrine\SqlFormatter;
use function assert;
use function in_array;
use function str_contains;
/**
 * Immutable pair of a token type and the non-empty text it covers.
 */
final class Token
{
    // Constants for token types
    public const TOKEN_TYPE_WHITESPACE = 0;
    public const TOKEN_TYPE_WORD = 1;
    public const TOKEN_TYPE_QUOTE = 2;
    public const TOKEN_TYPE_BACKTICK_QUOTE = 3;
    public const TOKEN_TYPE_RESERVED = 4;
    public const TOKEN_TYPE_RESERVED_TOPLEVEL = 5;
    public const TOKEN_TYPE_RESERVED_NEWLINE = 6;
    public const TOKEN_TYPE_BOUNDARY = 7;
    public const TOKEN_TYPE_COMMENT = 8;
    public const TOKEN_TYPE_BLOCK_COMMENT = 9;
    public const TOKEN_TYPE_NUMBER = 10;
    public const TOKEN_TYPE_VARIABLE = 11;

    /** @param self::TOKEN_TYPE_* $type */
    public function __construct(
        private readonly int $type,
        private readonly string $value,
    ) {
        // A token always covers at least one character
        assert($value !== '');
    }

    /**
     * The text covered by this token.
     */
    public function value(): string
    {
        return $this->value;
    }

    /** @return self::TOKEN_TYPE_* */
    public function type(): int
    {
        return $this->type;
    }

    /**
     * Tells whether the token's type is one of the given types.
     *
     * @param self::TOKEN_TYPE_* ...$types
     */
    public function isOfType(int ...$types): bool
    {
        foreach ($types as $candidate) {
            if ($candidate === $this->type) {
                return true;
            }
        }

        return false;
    }

    /**
     * Tells whether the value contains a space, a line feed or a tab.
     */
    public function hasExtraWhitespace(): bool
    {
        foreach ([' ', "\n", "\t"] as $whitespace) {
            if (str_contains($this->value, $whitespace)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Returns a new token of the same type carrying the given value.
     */
    public function withValue(string $value): self
    {
        return new self($this->type, $value);
    }
}

View File

@@ -0,0 +1,931 @@
<?php
declare(strict_types=1);
namespace Doctrine\SqlFormatter;
use function array_key_last;
use function array_map;
use function array_pop;
use function assert;
use function count;
use function implode;
use function is_int;
use function preg_match;
use function preg_quote;
use function reset;
use function str_replace;
use function str_starts_with;
use function strlen;
use function strtoupper;
use function substr;
use function usort;
/** @internal */
final class Tokenizer
{
/**
* Reserved words (for syntax highlighting)
*
* @var list<string>
*/
private array $reserved = [
'ACCESSIBLE',
'ACTION',
'ADD',
'AFTER',
'AGAINST',
'AGGREGATE',
'ALGORITHM',
'ALL',
'ALTER',
'ANALYSE',
'ANALYZE',
'AND',
'AS',
'ASC',
'AUTOCOMMIT',
'AUTO_INCREMENT',
'BACKUP',
'BEGIN',
'BETWEEN',
'BIGINT',
'BINARY',
'BINLOG',
'BLOB',
'BOTH',
'BY',
'CASCADE',
'CASE',
'CHANGE',
'CHANGED',
'CHAR',
'CHARACTER',
'CHARSET',
'CHECK',
'CHECKSUM',
'COLLATE',
'COLLATION',
'COLUMN',
'COLUMNS',
'COMMENT',
'COMMIT',
'COMMITTED',
'COMPRESSED',
'CONCURRENT',
'CONSTRAINT',
'CONTAINS',
'CONVERT',
'CREATE',
'CROSS',
'CURRENT',
'CURRENT_TIMESTAMP',
'DATABASE',
'DATABASES',
'DAY',
'DAY_HOUR',
'DAY_MINUTE',
'DAY_SECOND',
'DECIMAL',
'DEFAULT',
'DEFINER',
'DELAYED',
'DELETE',
'DESC',
'DESCRIBE',
'DETERMINISTIC',
'DISTINCT',
'DISTINCTROW',
'DIV',
'DO',
'DOUBLE',
'DROP',
'DUMPFILE',
'DUPLICATE',
'DYNAMIC',
'ELSE',
'ENCLOSED',
'END',
'ENGINE',
'ENGINES',
'ENGINE_TYPE',
'ESCAPE',
'ESCAPED',
'EVENTS',
'EXCEPT',
'EXCLUDE',
'EXEC',
'EXECUTE',
'EXISTS',
'EXPLAIN',
'EXTENDED',
'FALSE',
'FAST',
'FETCH',
'FIELDS',
'FILE',
'FILTER',
'FIRST',
'FIXED',
'FLOAT',
'FLOAT4',
'FLOAT8',
'FLUSH',
'FOLLOWING',
'FOR',
'FORCE',
'FOREIGN',
'FROM',
'FULL',
'FULLTEXT',
'FUNCTION',
'GLOBAL',
'GRANT',
'GRANTS',
'GROUP',
'GROUPS',
'HAVING',
'HEAP',
'HIGH_PRIORITY',
'HOSTS',
'HOUR',
'HOUR_MINUTE',
'HOUR_SECOND',
'IDENTIFIED',
'IF',
'IFNULL',
'IGNORE',
'IN',
'INDEX',
'INDEXES',
'INFILE',
'INNER',
'INSERT',
'INSERT_ID',
'INSERT_METHOD',
'INT',
'INT1',
'INT2',
'INT3',
'INT4',
'INT8',
'INTEGER',
'INTERSECT',
'INTERVAL',
'INTO',
'INVOKER',
'IS',
'ISOLATION',
'JOIN',
'KEY',
'KEYS',
'KILL',
'LAST_INSERT_ID',
'LEADING',
'LEFT',
'LEVEL',
'LIKE',
'LIMIT',
'LINEAR',
'LINES',
'LOAD',
'LOCAL',
'LOCK',
'LOCKS',
'LOGS',
'LONG',
'LONGBLOB',
'LONGTEXT',
'LOW_PRIORITY',
'MARIA',
'MASTER',
'MASTER_CONNECT_RETRY',
'MASTER_HOST',
'MASTER_LOG_FILE',
'MATCH',
'MAX_CONNECTIONS_PER_HOUR',
'MAX_QUERIES_PER_HOUR',
'MAX_ROWS',
'MAX_UPDATES_PER_HOUR',
'MAX_USER_CONNECTIONS',
'MEDIUM',
'MEDIUMBLOB',
'MEDIUMINT',
'MEDIUMTEXT',
'MERGE',
'MINUTE',
'MINUTE_SECOND',
'MIN_ROWS',
'MODE',
'MODIFY',
'MONTH',
'MRG_MYISAM',
'MYISAM',
'NAMES',
'NATURAL',
'NOT',
'NULL',
'NUMERIC',
'OFFSET',
'ON',
'OPEN',
'OPTIMIZE',
'OPTION',
'OPTIONALLY',
'OR',
'ORDER',
'OUTER',
'OUTFILE',
'OVER',
'PACK_KEYS',
'PAGE',
'PARTIAL',
'PARTITION',
'PARTITIONS',
'PASSWORD',
'PRECEDING',
'PRIMARY',
'PRIVILEGES',
'PROCEDURE',
'PROCESS',
'PROCESSLIST',
'PURGE',
'QUICK',
'RAID0',
'RAID_CHUNKS',
'RAID_CHUNKSIZE',
'RAID_TYPE',
'RANGE',
'READ',
'READ_ONLY',
'READ_WRITE',
'REAL',
'RECURSIVE',
'REFERENCES',
'REGEXP',
'RELOAD',
'RENAME',
'REPAIR',
'REPEATABLE',
'REPLACE',
'REPLICATION',
'RESET',
'RESTORE',
'RESTRICT',
'RETURN',
'RETURNS',
'REVOKE',
'RIGHT',
'RLIKE',
'ROLLBACK',
'ROW',
'ROWS',
'ROW_FORMAT',
'SECOND',
'SECURITY',
'SELECT',
'SEPARATOR',
'SERIALIZABLE',
'SESSION',
'SET',
'SHARE',
'SHOW',
'SHUTDOWN',
'SLAVE',
'SMALLINT',
'SONAME',
'SOUNDS',
'SQL',
'SQL_AUTO_IS_NULL',
'SQL_BIG_RESULT',
'SQL_BIG_SELECTS',
'SQL_BIG_TABLES',
'SQL_BUFFER_RESULT',
'SQL_CACHE',
'SQL_CALC_FOUND_ROWS',
'SQL_LOG_BIN',
'SQL_LOG_OFF',
'SQL_LOG_UPDATE',
'SQL_LOW_PRIORITY_UPDATES',
'SQL_MAX_JOIN_SIZE',
'SQL_NO_CACHE',
'SQL_QUOTE_SHOW_CREATE',
'SQL_SAFE_UPDATES',
'SQL_SELECT_LIMIT',
'SQL_SLAVE_SKIP_COUNTER',
'SQL_SMALL_RESULT',
'SQL_WARNINGS',
'START',
'STARTING',
'STATUS',
'STOP',
'STORAGE',
'STRAIGHT_JOIN',
'STRING',
'STRIPED',
'SUPER',
'TABLE',
'TABLES',
'TEMPORARY',
'TERMINATED',
'THEN',
'TIES',
'TINYBLOB',
'TINYINT',
'TINYTEXT',
'TO',
'TRAILING',
'TRANSACTIONAL',
'TRUE',
'TRUNCATE',
'TYPE',
'TYPES',
'UNBOUNDED',
'UNCOMMITTED',
'UNION',
'UNIQUE',
'UNLOCK',
'UNSIGNED',
'UPDATE',
'USAGE',
'USE',
'USING',
'VALUES',
'VARBINARY',
'VARCHAR',
'VARCHARACTER',
'VARIABLES',
'VIEW',
'WHEN',
'WHERE',
'WINDOW',
'WITH',
'WORK',
'WRITE',
'XOR',
'YEAR_MONTH',
];
/**
* For SQL formatting
* These keywords will all be on their own line
*
* @var list<string>
*/
private array $reservedToplevel = [
'ADD',
'ALTER TABLE',
'CHANGE',
'DELETE FROM',
'DROP',
'EXCEPT',
'FETCH',
'FROM',
'GROUP BY',
'GROUPS',
'HAVING',
'INTERSECT',
'LIMIT',
'MODIFY',
'OFFSET',
'ORDER BY',
'PARTITION BY',
'RANGE',
'ROWS',
'SELECT',
'SET',
'UNION',
'UNION ALL',
'UPDATE',
'VALUES',
'WHERE',
'WINDOW',
'WITH',
];
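/**
 * Reserved words that are tokenized as Token::TOKEN_TYPE_RESERVED_NEWLINE.
 * A space inside an entry stands for any run of whitespace.
 *
 * @var list<string>
 */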
private array $reservedNewline = [
'AND',
'EXCLUDE',
'INNER JOIN',
'JOIN',
'LEFT JOIN',
'LEFT OUTER JOIN',
'OR',
'OUTER JOIN',
'RIGHT JOIN',
'RIGHT OUTER JOIN',
'STRAIGHT_JOIN',
'XOR',
];
/**
 * Function names; tokenized as Token::TOKEN_TYPE_RESERVED only when followed by "(".
 *
 * @var list<string>
 */
private array $functions = [
'ABS',
'ACOS',
'ADDDATE',
'ADDTIME',
'AES_DECRYPT',
'AES_ENCRYPT',
'APPROX_COUNT_DISTINCT',
'AREA',
'ASBINARY',
'ASCII',
'ASIN',
'ASTEXT',
'ATAN',
'ATAN2',
'AVG',
'BDMPOLYFROMTEXT',
'BDMPOLYFROMWKB',
'BDPOLYFROMTEXT',
'BDPOLYFROMWKB',
'BENCHMARK',
'BIN',
'BIT_AND',
'BIT_COUNT',
'BIT_LENGTH',
'BIT_OR',
'BIT_XOR',
'BOUNDARY',
'BUFFER',
'CAST',
'CEIL',
'CEILING',
'CENTROID',
'CHARACTER_LENGTH',
'CHAR_LENGTH',
'CHECKSUM_AGG',
'COALESCE',
'COERCIBILITY',
'COMPRESS',
'CONCAT',
'CONCAT_WS',
'CONNECTION_ID',
'CONV',
'CONVERT_TZ',
'CONVEXHULL',
'COS',
'COT',
'COUNT',
'COUNT_BIG',
'CRC32',
'CROSSES',
'CUME_DIST',
'CURDATE',
'CURRENT_DATE',
'CURRENT_TIME',
'CURRENT_USER',
'CURTIME',
'DATE',
'DATEDIFF',
'DATE_ADD',
'DATE_DIFF',
'DATE_FORMAT',
'DATE_SUB',
'DAYNAME',
'DAYOFMONTH',
'DAYOFWEEK',
'DAYOFYEAR',
'DECODE',
'DEGREES',
'DENSE_RANK',
'DES_DECRYPT',
'DES_ENCRYPT',
'DIFFERENCE',
'DIMENSION',
'DISJOINT',
'DISTANCE',
'ELT',
'ENCODE',
'ENCRYPT',
'ENDPOINT',
'ENVELOPE',
'EQUALS',
'EXP',
'EXPORT_SET',
'EXTERIORRING',
'EXTRACT',
'EXTRACTVALUE',
'FIELD',
'FIND_IN_SET',
'FIRST_VALUE',
'FLOOR',
'FORMAT',
'FOUND_ROWS',
'FROM_DAYS',
'FROM_UNIXTIME',
'GEOMCOLLFROMTEXT',
'GEOMCOLLFROMWKB',
'GEOMETRYCOLLECTION',
'GEOMETRYCOLLECTIONFROMTEXT',
'GEOMETRYCOLLECTIONFROMWKB',
'GEOMETRYFROMTEXT',
'GEOMETRYFROMWKB',
'GEOMETRYN',
'GEOMETRYTYPE',
'GEOMFROMTEXT',
'GEOMFROMWKB',
'GET_FORMAT',
'GET_LOCK',
'GLENGTH',
'GREATEST',
'GROUPING',
'GROUPING_ID',
'GROUP_CONCAT',
'GROUP_UNIQUE_USERS',
'HEX',
'INET_ATON',
'INET_NTOA',
'INSTR',
'INTERIORRINGN',
'INTERSECTION',
'INTERSECTS',
'ISCLOSED',
'ISEMPTY',
'ISNULL',
'ISRING',
'ISSIMPLE',
'IS_FREE_LOCK',
'IS_USED_LOCK',
'LAG',
'LAST_DAY',
'LAST_VALUE',
'LCASE',
'LEAD',
'LEAST',
'LENGTH',
'LINEFROMTEXT',
'LINEFROMWKB',
'LINESTRING',
'LINESTRINGFROMTEXT',
'LINESTRINGFROMWKB',
'LISTAGG',
'LN',
'LOAD_FILE',
'LOCALTIME',
'LOCALTIMESTAMP',
'LOCATE',
'LOG',
'LOG10',
'LOG2',
'LOWER',
'LPAD',
'LTRIM',
'MAKEDATE',
'MAKETIME',
'MAKE_SET',
'MASTER_POS_WAIT',
'MAX',
'MBRCONTAINS',
'MBRDISJOINT',
'MBREQUAL',
'MBRINTERSECTS',
'MBROVERLAPS',
'MBRTOUCHES',
'MBRWITHIN',
'MD5',
'MICROSECOND',
'MID',
'MIN',
'MLINEFROMTEXT',
'MLINEFROMWKB',
'MOD',
'MONTHNAME',
'MPOINTFROMTEXT',
'MPOINTFROMWKB',
'MPOLYFROMTEXT',
'MPOLYFROMWKB',
'MULTILINESTRING',
'MULTILINESTRINGFROMTEXT',
'MULTILINESTRINGFROMWKB',
'MULTIPOINT',
'MULTIPOINTFROMTEXT',
'MULTIPOINTFROMWKB',
'MULTIPOLYGON',
'MULTIPOLYGONFROMTEXT',
'MULTIPOLYGONFROMWKB',
'NAME_CONST',
'NOW',
'NTH_VALUE',
'NTILE',
'NULLIF',
'NUMGEOMETRIES',
'NUMINTERIORRINGS',
'NUMPOINTS',
'OCT',
'OCTET_LENGTH',
'OLD_PASSWORD',
'ORD',
'OVERLAPS',
'PERCENTILE_CONT',
'PERCENTILE_DISC',
'PERCENT_RANK',
'PERIOD_ADD',
'PERIOD_DIFF',
'PI',
'POINT',
'POINTFROMTEXT',
'POINTFROMWKB',
'POINTN',
'POINTONSURFACE',
'POLYFROMTEXT',
'POLYFROMWKB',
'POLYGON',
'POLYGONFROMTEXT',
'POLYGONFROMWKB',
'POSITION',
'POW',
'POWER',
'QUARTER',
'QUOTE',
'RADIANS',
'RAND',
'RANK',
'RELATED',
'RELEASE_LOCK',
'REPEAT',
'REVERSE',
'ROUND',
'ROW_COUNT',
'ROW_NUMBER',
'RPAD',
'RTRIM',
'SCHEMA',
'SEC_TO_TIME',
'SESSION_USER',
'SHA',
'SHA1',
'SIGN',
'SIN',
'SLEEP',
'SOUNDEX',
'SPACE',
'SQRT',
'SRID',
'STARTPOINT',
'STD',
'STDDEV',
'STDDEV_POP',
'STDDEV_SAMP',
'STDEV',
'STDEVP',
'STRCMP',
'STRING_AGG',
'STR_TO_DATE',
'SUBDATE',
'SUBSTR',
'SUBSTRING',
'SUBSTRING_INDEX',
'SUBTIME',
'SUM',
'SYMDIFFERENCE',
'SYSDATE',
'SYSTEM_USER',
'TAN',
'TIME',
'TIMEDIFF',
'TIMESTAMP',
'TIMESTAMPADD',
'TIMESTAMPDIFF',
'TIME_FORMAT',
'TIME_TO_SEC',
'TOUCHES',
'TO_DAYS',
'TRIM',
'UCASE',
'UNCOMPRESS',
'UNCOMPRESSED_LENGTH',
'UNHEX',
'UNIQUE_USERS',
'UNIX_TIMESTAMP',
'UPDATEXML',
'UPPER',
'USER',
'UTC_DATE',
'UTC_TIME',
'UTC_TIMESTAMP',
'UUID',
'VAR',
'VARIANCE',
'VARP',
'VAR_POP',
'VAR_SAMP',
'VERSION',
'WEEK',
'WEEKDAY',
'WEEKOFYEAR',
'WITHIN',
'X',
'Y',
'YEAR',
'YEARWEEK',
];
/** Regular expression for tokenizing. */
private readonly string $tokenizeRegex;
/**
* Punctuation that can be used as a boundary between other tokens
*
* @var list<string>
*/
private array $boundaries = [
',',
';',
'::', // PostgreSQL cast operator
':',
')',
'(',
'.',
'=',
'<',
'>',
'+',
'-',
'~*', // https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-POSIX-REGEXP
'*',
'/',
'!',
'^',
'%',
'|',
'&',
'#',
];
/**
 * Builds the combined tokenizing regular expression once, so that every
 * tokenize() call on this instance can reuse it.
 */
public function __construct()
{
    $regexesByTokenType = $this->makeTokenizeRegexes();

    $this->tokenizeRegex = $this->makeTokenizeRegex($regexesByTokenType);
}
/**
 * Make regex from a list of values matching longest value first.
 *
 * Values are grouped by their first character; each group is emitted as its
 * longest shared prefix followed by a nested atomic group for the remainders,
 * so that every alternative branch has to be tried only once
 * (see https://github.com/PCRE2Project/pcre2/issues/411).
 *
 * @param list<string> $values
 * @param bool         $sorted Whether $values is already in the required order (used by the recursion)
 */
private function makeRegexFromList(array $values, bool $sorted = false): string
{
    // sort list alphabetically; where one value is a prefix of another, the longer one comes first
    if (! $sorted) {
        usort(
            $values,
            static fn (string $left, string $right): int => str_starts_with($left, $right) || str_starts_with($right, $left)
                ? strlen($right) <=> strlen($left)
                : $left <=> $right,
        );
    }

    /** @var array<int|string, list<string>> $groups */
    $groups = [];
    $members = [];
    $shared = null;

    foreach ($values as $value) {
        // A different first character closes the group collected so far
        if ($shared !== null && ! str_starts_with($value, substr($shared, 0, 1))) {
            $groups[$shared] = $members;
            $members = [];
            $shared = null;
        }

        $members[] = $value;

        if ($shared === null) {
            $shared = $value;

            continue;
        }

        // Shorten the shared prefix until the new member starts with it
        while (! str_starts_with($value, $shared)) {
            $shared = substr($shared, 0, -1);
        }
    }

    if ($members !== []) {
        $groups[(string) $shared] = $members;
    }

    $alternatives = [];
    foreach ($groups as $key => $group) {
        // Numeric-looking prefixes come back from the array as integer keys
        $head = (string) $key;
        $headLength = strlen($head);

        if (count($group) === 1) {
            $alternatives[] = preg_quote($head) . preg_quote(substr(reset($group), $headLength));

            continue;
        }

        $tails = array_map(static fn ($v) => substr($v, $headLength), $group);
        $alternatives[] = preg_quote($head) . $this->makeRegexFromList($tails, true);
    }

    return '(?>' . implode('|', $alternatives) . ')';
}
/**
 * Builds one regular expression per token type.
 *
 * The array order is the order in which the alternatives are tried by the
 * combined regex. The patterns are applied to the upper-cased input without
 * any case-insensitive flag, so literal letters must match upper-case text.
 *
 * @return array<Token::TOKEN_TYPE_*, string>
 */
private function makeTokenizeRegexes(): array
{
    // Set up regular expressions
    $regexBoundaries = $this->makeRegexFromList($this->boundaries);
    $regexReserved = $this->makeRegexFromList($this->reserved);
    // A space inside a multi-word keyword matches any run of whitespace
    $regexReservedToplevel = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedToplevel));
    $regexReservedNewline = str_replace(' ', '\s+', $this->makeRegexFromList($this->reservedNewline));
    $regexFunction = $this->makeRegexFromList($this->functions);

    return [
        Token::TOKEN_TYPE_WHITESPACE => '\s+',
        Token::TOKEN_TYPE_COMMENT => '(?:--|#(?!>))[^\n]*+', // #>, #>> and <#> are PostgreSQL operators
        Token::TOKEN_TYPE_BLOCK_COMMENT => '/\*(?:[^*]+|\*(?!/))*+(?:\*|$)(?:/|$)',
        // 1. backtick quoted string using `` to escape
        // 2. square bracket quoted string (SQL Server) using ]] to escape
        Token::TOKEN_TYPE_BACKTICK_QUOTE => <<<'EOD'
        (?>(?x)
        `(?:[^`]+|`(?:`|$))*+(?:`|$)
        |\[(?:[^\]]+|\](?:\]|$))*+(?:\]|$)
        )
        EOD,
        // 3. double quoted string using "" or \" to escape
        // 4. single quoted string using '' or \' to escape
        Token::TOKEN_TYPE_QUOTE => <<<'EOD'
        (?>(?sx)
        '(?:[^'\\]+|\\(?:.|$)|'(?:'|$))*+(?:'|$)
        |"(?:[^"\\]+|\\(?:.|$)|"(?:"|$))*+(?:"|$)
        )
        EOD,
        // User-defined variable, possibly with quoted name
        Token::TOKEN_TYPE_VARIABLE => '[@:](?:[\w.$]++|(?&t_' . Token::TOKEN_TYPE_BACKTICK_QUOTE . ')|(?&t_' . Token::TOKEN_TYPE_QUOTE . '))',
        // decimal, binary, or hex
        // The 0x / 0b prefixes accept both cases because the subject is upper-cased before matching;
        // the lookahead accepts any single quote character, like the word pattern below
        Token::TOKEN_TYPE_NUMBER => '(?:\d+(?:\.\d+)?|0[xX][\da-fA-F]+|0[bB][01]+)(?=$|\s|["\'`]|' . $regexBoundaries . ')',
        // punctuation and symbols
        Token::TOKEN_TYPE_BOUNDARY => $regexBoundaries,
        // A reserved word cannot be preceded by a '.'
        // this makes it so in "mytable.from", "from" is not considered a reserved word
        Token::TOKEN_TYPE_RESERVED_TOPLEVEL => '(?<!\.|\sCHARACTER\s(?=SET\s))' . $regexReservedToplevel . '(?=$|\s|' . $regexBoundaries . ')',
        Token::TOKEN_TYPE_RESERVED_NEWLINE => '(?<!\.)' . $regexReservedNewline . '(?=$|\s|' . $regexBoundaries . ')',
        Token::TOKEN_TYPE_RESERVED => '(?<!\.)' . $regexReserved . '(?=$|\s|' . $regexBoundaries . ')'
            // A function must be succeeded by '('
            // this makes it so "count(" is considered a function, but "count" alone is not function
            . '|' . $regexFunction . '(?=\s*\()',
        // Anything else, up to the next whitespace, quote character or boundary
        Token::TOKEN_TYPE_WORD => '.*?(?=$|\s|["\'`]|' . $regexBoundaries . ')',
    ];
}
/**
 * Joins the per-type patterns into a single regex anchored at the match offset.
 *
 * Each pattern becomes a named group "t_<token type>", so the name of the
 * group that matched identifies the token type.
 *
 * @param array<Token::TOKEN_TYPE_*, string> $regexes
 */
private function makeTokenizeRegex(array $regexes): string
{
    $namedGroups = [];
    foreach ($regexes as $tokenType => $pattern) {
        $namedGroups[] = '(?<t_' . $tokenType . '>' . $pattern . ')';
    }

    $alternation = implode('|', $namedGroups);

    return '(\G(?:' . $alternation . '))';
}
/**
 * Takes a SQL string and breaks it into tokens.
 *
 * Matching is done against an upper-cased copy of the input, while the token
 * values are cut from the original string, so they keep their original case.
 *
 * @param string $string The SQL string
 *
 * @return Cursor Cursor over the tokens in their order of appearance
 */
public function tokenize(string $string): Cursor
{
    $pattern = $this->tokenizeRegex;
    $subject = strtoupper($string);
    $collected = [];
    $offset = 0;

    while ($offset < strlen($string)) {
        // Match exactly one token starting at the current offset
        preg_match($pattern, $subject, $matches, 0, $offset);
        assert(($matches[0] ?? '') !== '');

        // Drop trailing numeric entries until the name of the matched group is the last key
        $groupName = array_key_last($matches);
        while (is_int($groupName)) {
            array_pop($matches);
            $groupName = array_key_last($matches);
        }

        assert(str_starts_with($groupName, 't_'));

        /** @var Token::TOKEN_TYPE_* $tokenType */
        $tokenType = (int) substr($groupName, 2);
        $matchedLength = strlen($matches[0]);

        $token = new Token($tokenType, substr($string, $offset, $matchedLength));
        $collected[] = $token;
        $offset += strlen($token->value());
    }

    return new Cursor($collected);
}
}