Skip to content

feat: support doris match_* operator --story=121938564 #4

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: v0.55.0-bkbase
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions src/ast/operator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,15 @@ pub enum BinaryOperator {
/// `~=` Same as? (PostgreSQL/Redshift geometric operator)
/// See <https://www.postgresql.org/docs/9.5/functions-geometry.html>
TildeEq,
/// Doris Match operator
/// See <https://doris.apache.org/zh-cn/docs/develop/sql-reference/operators/match/>
/// e.g. `a MATCH_* 'keyword1_xxxxxxx'`
MatchAll,
MatchAny,
MatchPhrase,
MatchPhrasePrefix,
MatchRegexp,
MatchPhraseEdge,
}

impl fmt::Display for BinaryOperator {
Expand Down Expand Up @@ -394,6 +403,12 @@ impl fmt::Display for BinaryOperator {
BinaryOperator::QuestionDoublePipe => f.write_str("?||"),
BinaryOperator::At => f.write_str("@"),
BinaryOperator::TildeEq => f.write_str("~="),
BinaryOperator::MatchAll => f.write_str("MATCH_ALL"),
BinaryOperator::MatchAny => f.write_str("MATCH_ANY"),
BinaryOperator::MatchPhrase => f.write_str("MATCH_PHRASE"),
BinaryOperator::MatchPhrasePrefix => f.write_str("MATCH_PHRASE_PREFIX"),
BinaryOperator::MatchRegexp => f.write_str("MATCH_REGEXP"),
BinaryOperator::MatchPhraseEdge => f.write_str("MATCH_PHRASE_EDGE"),
}
}
}
8 changes: 8 additions & 0 deletions src/dialect/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,14 @@ pub trait Dialect: Debug + Any {
Token::Word(w) if w.keyword == Keyword::RLIKE => Ok(p!(Like)),
Token::Word(w) if w.keyword == Keyword::REGEXP => Ok(p!(Like)),
Token::Word(w) if w.keyword == Keyword::SIMILAR => Ok(p!(Like)),
// Define Doris Match_* Operators: the same as LIKE precedence
Token::Word(w) if w.keyword == Keyword::MATCH_ALL => Ok(p!(Like)),
Token::Word(w) if w.keyword == Keyword::MATCH_ANY => Ok(p!(Like)),
Token::Word(w) if w.keyword == Keyword::MATCH_PHRASE => Ok(p!(Like)),
Token::Word(w) if w.keyword == Keyword::MATCH_PHRASE_PREFIX => Ok(p!(Like)),
Token::Word(w) if w.keyword == Keyword::MATCH_REGEXP => Ok(p!(Like)),
Token::Word(w) if w.keyword == Keyword::MATCH_PHRASE_EDGE => Ok(p!(Like)),
// End Doris Match_* Operators
Token::Word(w) if w.keyword == Keyword::OPERATOR => Ok(p!(Between)),
Token::Word(w) if w.keyword == Keyword::DIV => Ok(p!(MulDivModOp)),
Token::Period => Ok(p!(Period)),
Expand Down
6 changes: 6 additions & 0 deletions src/keywords.rs
Original file line number Diff line number Diff line change
Expand Up @@ -514,8 +514,14 @@ define_keywords!(
MATCH,
MATCHED,
MATCHES,
MATCH_ALL,
MATCH_ANY,
MATCH_CONDITION,
MATCH_PHRASE,
MATCH_PHRASE_EDGE,
MATCH_PHRASE_PREFIX,
MATCH_RECOGNIZE,
MATCH_REGEXP,
MATERIALIZE,
MATERIALIZED,
MAX,
Expand Down
78 changes: 78 additions & 0 deletions src/parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3227,6 +3227,21 @@ impl<'a> Parser<'a> {
self.expect_token(&Token::RParen)?;
Some(BinaryOperator::PGCustomBinaryOperator(idents))
}
// Doris match operators
Keyword::MATCH_ALL => dialect_is!(dialect is MySqlDialect | GenericDialect)
.then_some(BinaryOperator::MatchAll),
Keyword::MATCH_ANY => dialect_is!(dialect is MySqlDialect | GenericDialect)
.then_some(BinaryOperator::MatchAny),
Keyword::MATCH_PHRASE => dialect_is!(dialect is MySqlDialect | GenericDialect)
.then_some(BinaryOperator::MatchPhrase),
Keyword::MATCH_PHRASE_PREFIX => {
dialect_is!(dialect is MySqlDialect | GenericDialect)
.then_some(BinaryOperator::MatchPhrasePrefix)
}
Keyword::MATCH_PHRASE_EDGE => dialect_is!(dialect is MySqlDialect | GenericDialect)
.then_some(BinaryOperator::MatchPhraseEdge),
Keyword::MATCH_REGEXP => dialect_is!(dialect is MySqlDialect | GenericDialect)
.then_some(BinaryOperator::MatchRegexp),
_ => None,
},
_ => None,
Expand Down Expand Up @@ -15423,4 +15438,67 @@ mod tests {

assert!(Parser::parse_sql(&MySqlDialect {}, sql).is_err());
}

/// Checks that the MySQL dialect parses a statement for each Doris `MATCH_*` operator.
///
/// Only parse success is asserted; the resulting AST is not inspected.
#[test]
fn test_doris_match_operators() {
    let dialect = &MySqlDialect {};

    // Copy from https://doris.apache.org/docs/table-design/index/inverted-index
    let test_cases = [
        // 1.1
        "SELECT * FROM table_name WHERE content MATCH_ANY 'keyword1';",
        // 1.2
        "SELECT * FROM table_name WHERE content MATCH_ANY 'keyword1 keyword2';",
        // 1.3
        "SELECT * FROM table_name WHERE content MATCH_ALL 'keyword1 keyword2';",
        // 2.1
        "SELECT * FROM table_name WHERE content MATCH_PHRASE 'keyword1 keyword2';",
        // 2.2
        "SELECT * FROM table_name WHERE content MATCH_PHRASE 'keyword1 keyword2 ~3';",
        "SELECT * FROM table_name WHERE content MATCH_PHRASE 'keyword1 keyword2 ~3+';",
        // 2.3
        "SELECT * FROM table_name WHERE content MATCH_PHRASE_PREFIX 'keyword1 keyword2';",
        // 2.4
        "SELECT * FROM table_name WHERE content MATCH_PHRASE_PREFIX 'keyword1';",
        // 2.5
        "SELECT * FROM table_name WHERE content MATCH_REGEXP 'key*';",
        // Not one of the numbered examples above: added so that every MATCH_*
        // keyword handled by the parser has at least one parse case.
        "SELECT * FROM table_name WHERE content MATCH_PHRASE_EDGE 'keyword1 keyword2';",
    ];

    for sql in test_cases {
        // Report the offending statement and the parser error; a bare
        // `assert!(.. .is_ok())` inside a loop hides both.
        if let Err(err) = Parser::parse_sql(dialect, sql) {
            panic!("failed to parse `{}`: {}", sql, err);
        }
    }
}

/// Parses one large query that mixes `MATCH_*` with `AND`, `OR`, `=`, `>`,
/// `LIKE` and `BETWEEN ... AND` under the MySQL dialect.
///
/// Only parse success is asserted; the grouping of the resulting expression
/// tree is not inspected.
#[test]
fn test_doris_match_precedence() {
    let dialect = &MySqlDialect {};
    // Test sql with and, or, equal, like, between ... and operator
    let sql = "SELECT
id,
title,
content,
score * 2 AS weighted_score
FROM
documents
WHERE
content MATCH_ALL 'important concept theory'
OR (
author = 'Smith' AND content MATCH_ALL 'methodology approach'
)
AND (
(references > 10 AND citations MATCH_ALL 'credible source')
OR (importance = 'high' AND content MATCH_ALL 'breakthrough discovery')
)
AND (
(keywords LIKE '%analysis%' OR keywords MATCH_PHRASE 'evaluation')
AND (abstract MATCH_ALL 'systematic review' OR conclusion LIKE '%finding%')
)
AND publication_date BETWEEN '2020-01-01' AND '2023-12-31'
ORDER BY
weighted_score DESC,
publication_date DESC
LIMIT 50";

    // Surface the parser error on failure instead of a bare assertion failure.
    if let Err(err) = Parser::parse_sql(dialect, sql) {
        panic!("failed to parse mixed-operator query: {}", err);
    }
}
}
56 changes: 56 additions & 0 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,18 @@ pub enum Token {
DuckIntDiv,
/// Modulo Operator `%`
Mod,
/// MATCH_ALL
MatchAll,
/// MATCH_ANY
MatchAny,
/// MATCH_PHRASE
MatchPhrase,
/// MATCH_REGEXP
MatchRegexp,
/// MATCH_PHRASE_PREFIX
MatchPhrasePrefix,
/// MATCH_PHRASE_EDGE
MatchPhraseEdge,
/// String concatenation `||`
StringConcat,
/// Left parenthesis `(`
Expand Down Expand Up @@ -382,6 +394,12 @@ impl fmt::Display for Token {
Token::QuestionAnd => write!(f, "?&"),
Token::QuestionPipe => write!(f, "?|"),
Token::CustomBinaryOperator(s) => f.write_str(s),
Token::MatchAll => write!(f, "MATCH_ALL"),
Token::MatchAny => write!(f, "MATCH_ANY"),
Token::MatchPhrase => write!(f, "MATCH_PHRASE"),
Token::MatchRegexp => write!(f, "MATCH_REGEXP"),
Token::MatchPhraseEdge => write!(f, "MATCH_PHRASE_EDGE"),
Token::MatchPhrasePrefix => write!(f, "MATCH_PHRASE_PREFIX"),
}
}
}
Expand Down Expand Up @@ -3942,4 +3960,42 @@ mod tests {
],
);
}

/// Tokenizes a `WHERE content <op> '...'` query once per Doris `MATCH_*`
/// operator and compares the token stream against one built with
/// `Token::make_keyword` for the operator.
#[test]
fn test_doris_match_phrase_operator() {
    const OPERATORS: [&str; 6] = [
        "MATCH_ALL",
        "MATCH_ANY",
        "MATCH_PHRASE",
        "MATCH_REGEXP",
        "MATCH_PHRASE_PREFIX",
        "MATCH_PHRASE_EDGE",
    ];

    let dialect = MySqlDialect {};
    // Every gap between tokens in the query below is a single space.
    let space = || Token::Whitespace(Whitespace::Space);

    for operator in OPERATORS {
        let query = format!(
            "SELECT * FROM table_name WHERE content {} 'keyword1 keyword2 ~3'",
            operator
        );
        let actual = Tokenizer::new(&dialect, &query).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            space(),
            Token::Mul,
            space(),
            Token::make_keyword("FROM"),
            space(),
            Token::make_word("table_name", None),
            space(),
            Token::make_keyword("WHERE"),
            space(),
            Token::make_word("content", None),
            space(),
            Token::make_keyword(operator),
            space(),
            Token::SingleQuotedString("keyword1 keyword2 ~3".into()),
        ];

        compare(expected, actual);
    }
}
}
21 changes: 21 additions & 0 deletions tests/sqlparser_mysql.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3418,3 +3418,24 @@ fn parse_cast_integers() {
.run_parser_method("CAST(foo AS UNSIGNED INTEGER(3))", |p| p.parse_expr())
.expect_err("CAST doesn't allow display width");
}

/// Round-trips expressions and statements that use the Doris `MATCH_*`
/// operators through the MySQL dialect via `verified_expr` / `verified_stmt`.
///
/// Each template is instantiated once per operator, so all six operators are
/// exercised rather than `MATCH_ALL` alone. With `{op}` = `MATCH_ALL` the
/// templates reproduce the original hand-written cases exactly.
#[test]
fn parse_extend_match_operator() {
    const OPERATORS: [&str; 6] = [
        "MATCH_ALL",
        "MATCH_ANY",
        "MATCH_PHRASE",
        "MATCH_PHRASE_PREFIX",
        "MATCH_REGEXP",
        "MATCH_PHRASE_EDGE",
    ];

    // `{op}` is a plain placeholder substituted with `str::replace` below.
    const EXPR_TEMPLATES: [&str; 5] = [
        "foo LIKE 'apple' OR foo {op} 'bar'",
        "foo LIKE 'apple' OR foo {op} 'bar' OR foo LIKE 'cherry'",
        "foo {op} 'bar' OR foo LIKE 'apple' OR foo LIKE 'cherry'",
        "foo {op} 'bar' OR foo LIKE 'apple' OR foo {op} 'cherry'",
        "foo {op} 'bar' OR foo {op} 'apple' OR foo {op} 'cherry'",
    ];

    const STMT_TEMPLATES: [&str; 5] = [
        "SELECT * FROM table_name WHERE foo {op} 'bar'",
        "SELECT * FROM table_name WHERE foo {op} 'bar' AND foo {op} 'apple'",
        "SELECT * FROM table_name WHERE foo {op} 'bar' OR foo {op} 'apple'",
        "SELECT * FROM table_name WHERE foo LIKE 'apple' OR foo {op} 'bar'",
        "SELECT * FROM table_name WHERE foo {op} 'bar' AND foo {op} 'apple' OR a BETWEEN 1 AND 2",
    ];

    for op in OPERATORS {
        for template in EXPR_TEMPLATES {
            mysql().verified_expr(&template.replace("{op}", op));
        }
        for template in STMT_TEMPLATES {
            mysql().verified_stmt(&template.replace("{op}", op));
        }
    }
}