Skip to content

Commit 6157f57

Browse files
hsiong, 李远军, yourchanges
authored
feat: Add OceanBase hybrid search features (#16652)
Co-authored-by: 李远军 <[email protected]> Co-authored-by: yourchanges <[email protected]>
1 parent c4bb071 commit 6157f57

File tree

7 files changed

+121
-22
lines changed

7 files changed

+121
-22
lines changed

api/.env.example

+1
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,7 @@ OCEANBASE_VECTOR_USER=root@test
297297
OCEANBASE_VECTOR_PASSWORD=difyai123456
298298
OCEANBASE_VECTOR_DATABASE=test
299299
OCEANBASE_MEMORY_LIMIT=6G
300+
OCEANBASE_ENABLE_HYBRID_SEARCH=false
300301

301302
# openGauss configuration
302303
OPENGAUSS_HOST=127.0.0.1

api/configs/middleware/vdb/oceanbase_config.py

+6
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,9 @@ class OceanBaseVectorConfig(BaseSettings):
3333
description="Name of the OceanBase Vector database to connect to",
3434
default=None,
3535
)
36+
37+
OCEANBASE_ENABLE_HYBRID_SEARCH: bool = Field(
38+
description="Enable hybrid search features (requires OceanBase >= 4.3.5.1). Set to false for compatibility "
39+
"with older versions",
40+
default=False,
41+
)

api/controllers/console/datasets/datasets.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -646,7 +646,6 @@ def get(self):
646646
| VectorType.BAIDU
647647
| VectorType.VIKINGDB
648648
| VectorType.UPSTASH
649-
| VectorType.OCEANBASE
650649
):
651650
return {"retrieval_method": [RetrievalMethod.SEMANTIC_SEARCH.value]}
652651
case (
@@ -664,6 +663,7 @@ def get(self):
664663
| VectorType.COUCHBASE
665664
| VectorType.MILVUS
666665
| VectorType.OPENGAUSS
666+
| VectorType.OCEANBASE
667667
):
668668
return {
669669
"retrieval_method": [
@@ -692,7 +692,6 @@ def get(self, vector_type):
692692
| VectorType.BAIDU
693693
| VectorType.VIKINGDB
694694
| VectorType.UPSTASH
695-
| VectorType.OCEANBASE
696695
):
697696
return {"retrieval_method": [RetrievalMethod.SEMANTIC_SEARCH.value]}
698697
case (
@@ -708,6 +707,7 @@ def get(self, vector_type):
708707
| VectorType.PGVECTOR
709708
| VectorType.LINDORM
710709
| VectorType.OPENGAUSS
710+
| VectorType.OCEANBASE
711711
):
712712
return {
713713
"retrieval_method": [

api/core/rag/datasource/vdb/oceanbase/oceanbase_vector.py

+100-16
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ class OceanBaseVectorConfig(BaseModel):
3131
user: str
3232
password: str
3333
database: str
34+
enable_hybrid_search: bool = False
3435

3536
@model_validator(mode="before")
3637
@classmethod
@@ -57,6 +58,7 @@ def __init__(self, collection_name: str, config: OceanBaseVectorConfig):
5758
password=self._config.password,
5859
db_name=self._config.database,
5960
)
61+
self._hybrid_search_enabled = self._check_hybrid_search_support() # Check if hybrid search is supported
6062

6163
def get_type(self) -> str:
6264
return VectorType.OCEANBASE
@@ -98,6 +100,16 @@ def _create_collection(self) -> None:
98100
columns=cols,
99101
vidxs=vidx_params,
100102
)
103+
try:
104+
if self._hybrid_search_enabled:
105+
self._client.perform_raw_text_sql(f"""ALTER TABLE {self._collection_name}
106+
ADD FULLTEXT INDEX fulltext_index_for_col_text (text) WITH PARSER ik""")
107+
except Exception as e:
108+
raise Exception(
109+
"Failed to add fulltext index to the target table, your OceanBase version must be 4.3.5.1 or above "
110+
+ "to support fulltext index and vector index in the same table",
111+
e,
112+
)
101113
vals = []
102114
params = self._client.perform_raw_text_sql("SHOW PARAMETERS LIKE '%ob_vector_memory_limit_percentage%'")
103115
for row in params:
@@ -116,6 +128,27 @@ def _create_collection(self) -> None:
116128
)
117129
redis_client.set(collection_exist_cache_key, 1, ex=3600)
118130

131+
def _check_hybrid_search_support(self) -> bool:
132+
"""
133+
Check if the current OceanBase version supports hybrid search.
134+
Returns True if the version is >= 4.3.5.1, otherwise False.
135+
"""
136+
if not self._config.enable_hybrid_search:
137+
return False
138+
139+
try:
140+
from packaging import version
141+
142+
# return OceanBase_CE 4.3.5.1 (r101000042025031818-bxxxx) (Built Mar 18 2025 18:13:36)
143+
result = self._client.perform_raw_text_sql("SELECT @@version_comment AS version")
144+
ob_full_version = result.fetchone()[0]
145+
ob_version = ob_full_version.split()[1]
146+
logger.debug("Current OceanBase version is %s", ob_version)
147+
return version.parse(ob_version).base_version >= version.parse("4.3.5.1").base_version
148+
except Exception as e:
149+
logger.warning(f"Failed to check OceanBase version: {str(e)}. Disabling hybrid search.")
150+
return False
151+
119152
def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs):
120153
ids = self._get_uuids(documents)
121154
for id, doc, emb in zip(ids, documents, embeddings):
@@ -130,7 +163,7 @@ def add_texts(self, documents: list[Document], embeddings: list[list[float]], **
130163
)
131164

132165
def text_exists(self, id: str) -> bool:
    """Report whether the collection table returns any row for the given id."""
    # Look the id up through the client; a non-zero rowcount means it was found.
    lookup = self._client.get(table_name=self._collection_name, ids=id)
    found = lookup.rowcount != 0
    return bool(found)
135168

136169
def delete_by_ids(self, ids: list[str]) -> None:
@@ -139,9 +172,12 @@ def delete_by_ids(self, ids: list[str]) -> None:
139172
self._client.delete(table_name=self._collection_name, ids=ids)
140173

141174
def get_ids_by_metadata_field(self, key: str, value: str) -> list[str]:
    """
    Return the ids of all rows whose JSON metadata field `key` equals `value`.

    Args:
        key: Name of the metadata field; it is placed inside the JSON path
            expression `$.<key>`.
        value: Value the metadata field must equal.

    Returns:
        The `id` column of every matching row.
    """
    from sqlalchemy import text

    # Bind the value instead of splicing it into the SQL text, so quotes or
    # colons inside it can neither break the statement nor inject SQL.
    # NOTE(review): `key` is still interpolated because the JSON path is written
    # as a literal here; callers are presumed to pass trusted field names.
    condition = text(f"metadata->>'$.{key}' = :value").bindparams(value=value)
    cur = self._client.get(
        table_name=self._collection_name,
        ids=None,
        where_clause=[condition],
        output_column_name=["id"],
    )
    return [row[0] for row in cur]
@@ -151,36 +187,84 @@ def delete_by_metadata_field(self, key: str, value: str) -> None:
151187
self.delete_by_ids(ids)
152188

153189
def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]:
    """
    Run a full-text search over the `text` column of the collection table.

    Args:
        query: Text to match against the `text` column.
        **kwargs: Optional settings:
            top_k: Maximum number of rows to return; must be a positive
                integer (default 5).
            document_ids_filter: Iterable of document ids; when given, only
                rows whose metadata `document_id` is one of them are returned.

    Returns:
        Matching documents ordered by descending match score, with the score
        stored under `metadata["score"]`. An empty list is returned when
        hybrid search is disabled or when anything fails (the failure is
        logged, not raised).
    """
    if not self._hybrid_search_enabled:
        return []

    try:
        top_k = kwargs.get("top_k", 5)
        if not isinstance(top_k, int) or top_k <= 0:
            raise ValueError("top_k must be a positive integer")

        # Every user-supplied value is passed as a bound parameter rather than
        # being formatted into the SQL text, which closes the injection hole
        # the quoted-literal IN list used to leave open.
        params: dict[str, Any] = {"query": query}
        where_clause = ""
        document_ids_filter = kwargs.get("document_ids_filter")
        if document_ids_filter:
            placeholders = []
            for index, document_id in enumerate(document_ids_filter):
                name = f"document_id_{index}"
                params[name] = str(document_id)
                placeholders.append(f":{name}")
            where_clause = f" AND metadata->>'$.document_id' IN ({', '.join(placeholders)})"

        # top_k was validated as an int above, so it is safe to inline in LIMIT.
        full_sql = f"""SELECT metadata, text, MATCH (text) AGAINST (:query) AS score
        FROM {self._collection_name}
        WHERE MATCH (text) AGAINST (:query) > 0
        {where_clause}
        ORDER BY score DESC
        LIMIT {top_k}"""

        from sqlalchemy import text

        with self._client.engine.connect() as conn:
            with conn.begin():
                result = conn.execute(text(full_sql), params)
                rows = result.fetchall()

        docs = []
        for metadata_str, _text, score in rows:
            try:
                metadata = json.loads(metadata_str)
            except json.JSONDecodeError:
                # Keep the row but fall back to empty metadata.
                logger.warning("Invalid JSON metadata: %s", metadata_str)
                metadata = {}
            metadata["score"] = score
            docs.append(Document(page_content=_text, metadata=metadata))

        return docs
    except Exception as e:
        # Best effort: a failed full-text search yields no results instead of raising.
        logger.warning("Failed to fulltext search: %s.", e)
        return []
155233

156234
def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]:
157235
document_ids_filter = kwargs.get("document_ids_filter")
158-
where_clause = None
236+
_where_clause = None
159237
if document_ids_filter:
160238
document_ids = ", ".join(f"'{id}'" for id in document_ids_filter)
161239
where_clause = f"metadata->>'$.document_id' in ({document_ids})"
240+
from sqlalchemy import text
241+
242+
_where_clause = [text(where_clause)]
162243
ef_search = kwargs.get("ef_search", self._hnsw_ef_search)
163244
if ef_search != self._hnsw_ef_search:
164245
self._client.set_ob_hnsw_ef_search(ef_search)
165246
self._hnsw_ef_search = ef_search
166247
topk = kwargs.get("top_k", 10)
167-
cur = self._client.ann_search(
168-
table_name=self._collection_name,
169-
vec_column_name="vector",
170-
vec_data=query_vector,
171-
topk=topk,
172-
distance_func=func.l2_distance,
173-
output_column_names=["text", "metadata"],
174-
with_dist=True,
175-
where_clause=where_clause,
176-
)
248+
try:
249+
cur = self._client.ann_search(
250+
table_name=self._collection_name,
251+
vec_column_name="vector",
252+
vec_data=query_vector,
253+
topk=topk,
254+
distance_func=func.l2_distance,
255+
output_column_names=["text", "metadata"],
256+
with_dist=True,
257+
where_clause=_where_clause,
258+
)
259+
except Exception as e:
260+
raise Exception("Failed to search by vector. ", e)
177261
docs = []
178-
for text, metadata, distance in cur:
262+
for _text, metadata, distance in cur:
179263
metadata = json.loads(metadata)
180264
metadata["score"] = 1 - distance / math.sqrt(2)
181265
docs.append(
182266
Document(
183-
page_content=text,
267+
page_content=_text,
184268
metadata=metadata,
185269
)
186270
)

docker/.env.example

+1
Original file line numberDiff line numberDiff line change
@@ -554,6 +554,7 @@ OCEANBASE_VECTOR_PASSWORD=difyai123456
554554
OCEANBASE_VECTOR_DATABASE=test
555555
OCEANBASE_CLUSTER_NAME=difyai
556556
OCEANBASE_MEMORY_LIMIT=6G
557+
OCEANBASE_ENABLE_HYBRID_SEARCH=false
557558

558559
# opengauss configurations, only available when VECTOR_STORE is `opengauss`
559560
OPENGAUSS_HOST=opengauss

docker/docker-compose-template.yaml

+5-2
Original file line numberDiff line numberDiff line change
@@ -373,7 +373,8 @@ services:
373373

374374
# OceanBase vector database
375375
oceanbase:
376-
image: quay.io/oceanbase/oceanbase-ce:4.3.3.0-100000142024101215
376+
image: oceanbase/oceanbase-ce:4.3.5.1-101000042025031818
377+
container_name: oceanbase
377378
profiles:
378379
- oceanbase
379380
restart: always
@@ -386,7 +387,9 @@ services:
386387
OB_SYS_PASSWORD: ${OCEANBASE_VECTOR_PASSWORD:-difyai123456}
387388
OB_TENANT_PASSWORD: ${OCEANBASE_VECTOR_PASSWORD:-difyai123456}
388389
OB_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai}
389-
OB_SERVER_IP: '127.0.0.1'
390+
MODE: MINI
391+
ports:
392+
- "${OCEANBASE_VECTOR_PORT:-2881}:2881"
390393

391394
# Oracle vector database
392395
oracle:

docker/docker-compose.yaml

+6-2
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,7 @@ x-shared-env: &shared-api-worker-env
252252
OCEANBASE_VECTOR_DATABASE: ${OCEANBASE_VECTOR_DATABASE:-test}
253253
OCEANBASE_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai}
254254
OCEANBASE_MEMORY_LIMIT: ${OCEANBASE_MEMORY_LIMIT:-6G}
255+
OCEANBASE_ENABLE_HYBRID_SEARCH: ${OCEANBASE_ENABLE_HYBRID_SEARCH:-false}
255256
OPENGAUSS_HOST: ${OPENGAUSS_HOST:-opengauss}
256257
OPENGAUSS_PORT: ${OPENGAUSS_PORT:-6600}
257258
OPENGAUSS_USER: ${OPENGAUSS_USER:-postgres}
@@ -804,7 +805,8 @@ services:
804805

805806
# OceanBase vector database
806807
oceanbase:
807-
image: quay.io/oceanbase/oceanbase-ce:4.3.3.0-100000142024101215
808+
image: oceanbase/oceanbase-ce:4.3.5.1-101000042025031818
809+
container_name: oceanbase
808810
profiles:
809811
- oceanbase
810812
restart: always
@@ -817,7 +819,9 @@ services:
817819
OB_SYS_PASSWORD: ${OCEANBASE_VECTOR_PASSWORD:-difyai123456}
818820
OB_TENANT_PASSWORD: ${OCEANBASE_VECTOR_PASSWORD:-difyai123456}
819821
OB_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai}
820-
OB_SERVER_IP: '127.0.0.1'
822+
MODE: MINI
823+
ports:
824+
- "${OCEANBASE_VECTOR_PORT:-2881}:2881"
821825

822826
# Oracle vector database
823827
oracle:

0 commit comments

Comments
 (0)