Skip to content

Commit 5eb993f

Browse files
authored
Merge pull request #4741 from continuedev/pe/global-docs-cache
feat: use global docs cache
2 parents a2634fb + 849a5ea commit 5eb993f

File tree

18 files changed

+443
-769
lines changed

18 files changed

+443
-769
lines changed

CONTRIBUTING.md

-5
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,6 @@
2626
- [Writing Context Providers](#writing-context-providers)
2727
- [Adding an LLM Provider](#adding-an-llm-provider)
2828
- [Adding Models](#adding-models)
29-
- [Adding Pre-indexed Documentation](#adding-pre-indexed-documentation)
3029
- [📐 Continue Architecture](#-continue-architecture)
3130
- [Continue VS Code Extension](#continue-vs-code-extension)
3231
- [Continue JetBrains Extension](#continue-jetbrains-extension)
@@ -212,10 +211,6 @@ While any model that works with a supported provider can be used with Continue,
212211
- LLM Providers: Since many providers use their own custom strings to identify models, you'll have to add the translation from Continue's model name (the one you added to `index.d.ts`) and the model string for each of these providers: [Ollama](./core/llm/llms/Ollama.ts), [Together](./core/llm/llms/Together.ts), and [Replicate](./core/llm/llms/Replicate.ts). You can find their full model lists here: [Ollama](https://ollama.ai/library), [Together](https://docs.together.ai/docs/inference-models), [Replicate](https://replicate.com/collections/streaming-language-models).
213212
- [Prompt Templates](./core/llm/index.ts) - In this file you'll find the `autodetectTemplateType` function. Make sure that for the model name you just added, this function returns the correct template type. This is assuming that the chat template for that model is already built in Continue. If not, you will have to add the template type and corresponding edit and chat templates.
214213

215-
### Adding Pre-indexed Documentation
216-
217-
Continue's @docs context provider lets you easily reference entire documentation sites and then uses embeddings to add the most relevant pages to context. To make the experience as smooth as possible, we pre-index many of the most popular documentation sites. If you'd like to add new documentation to this list, just add an object to the list in [preIndexedDocs.ts](./core/indexing/docs/preIndexedDocs.ts). `startUrl` is where the crawler will start and `rootUrl` will filter out any pages not on that site and under the path of `rootUrl`.
218-
219214
## 📐 Continue Architecture
220215

221216
Continue consists of 2 parts that are split so that it can be extended to work in other IDEs as easily as possible:

core/.eslintrc.json

+1-1
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@
1414
"eqeqeq": "error",
1515
"complexity": ["error", { "max": 38 }],
1616
"max-lines-per-function": ["error", { "max": 996 }],
17-
"max-statements": ["error", { "max": 112 }],
17+
"max-statements": ["error", { "max": 114 }],
1818
"max-depth": ["error", { "max": 6 }],
1919
"max-nested-callbacks": ["error", { "max": 4 }],
2020
"max-params": ["error", { "max": 11 }]

core/context/providers/DocsContextProvider.ts

+9-45
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,6 @@ import {
99
LoadSubmenuItemsArgs,
1010
} from "../..";
1111
import DocsService from "../../indexing/docs/DocsService";
12-
import preIndexedDocs from "../../indexing/docs/preIndexedDocs";
1312

1413
import { INSTRUCTIONS_BASE_ITEM } from "./utils";
1514

@@ -64,23 +63,12 @@ class DocsContextProvider extends BaseContextProvider {
6463
return chunksCopy;
6564
}
6665

67-
private _sortByPreIndexedDocs(
66+
private _sortAlphabetically(
6867
submenuItems: ContextSubmenuItem[],
6968
): ContextSubmenuItem[] {
70-
// Sort submenuItems such that the objects with titles which don't occur in configs occur first, and alphabetized
69+
// Sort submenu items alphabetically by title
7170
return submenuItems.sort((a, b) => {
72-
const aTitleInConfigs = a.metadata?.preIndexed ?? false;
73-
const bTitleInConfigs = b.metadata?.preIndexed ?? false;
74-
75-
// Primary criterion: Items not in configs come first
76-
if (!aTitleInConfigs && bTitleInConfigs) {
77-
return -1;
78-
} else if (aTitleInConfigs && !bTitleInConfigs) {
79-
return 1;
80-
} else {
81-
// Secondary criterion: Alphabetical order when both items are in the same category
82-
return a.title.toString().localeCompare(b.title.toString());
83-
}
71+
return a.title.toString().localeCompare(b.title.toString());
8472
});
8573
}
8674

@@ -165,46 +153,22 @@ class DocsContextProvider extends BaseContextProvider {
165153
}
166154
await docsService.isInitialized;
167155

168-
// Create map of docs url -> submenu item
169-
const submenuItemsMap = new Map<string, ContextSubmenuItem>();
156+
// Create an array to hold submenu items
157+
const submenuItems: ContextSubmenuItem[] = [];
170158

171-
// Add custom docs from config
159+
// Get all indexed docs from the database
172160
const docs = (await docsService.listMetadata()) ?? [];
173161
for (const { startUrl, title, favicon } of docs) {
174-
submenuItemsMap.set(startUrl, {
162+
submenuItems.push({
175163
title,
176164
id: startUrl,
177165
description: new URL(startUrl).hostname,
178166
icon: favicon,
179167
});
180168
}
181169

182-
// Add pre-indexed docs if supported
183-
const canUsePreindexedDocs = await docsService.canUsePreindexedDocs();
184-
if (canUsePreindexedDocs) {
185-
for (const { startUrl, title } of Object.values(preIndexedDocs)) {
186-
// Skip if overridden in config
187-
if (docs.find((d) => d.startUrl === startUrl)) {
188-
continue;
189-
}
190-
submenuItemsMap.set(startUrl, {
191-
title,
192-
id: startUrl,
193-
description: new URL(startUrl).hostname,
194-
metadata: {
195-
preIndexed: true,
196-
},
197-
});
198-
}
199-
}
200-
201-
// Create array and sort if pre-indexed is supported
202-
const submenuItems = Array.from(submenuItemsMap.values());
203-
if (canUsePreindexedDocs) {
204-
return this._sortByPreIndexedDocs(submenuItems);
205-
}
206-
207-
return submenuItems;
170+
// Sort alphabetically
171+
return this._sortAlphabetically(submenuItems);
208172
}
209173
}
210174

core/index.d.ts

-2
Original file line number | Diff line number | Diff line change
@@ -236,15 +236,13 @@ export interface SiteIndexingConfig {
236236
maxDepth?: number;
237237
faviconUrl?: string;
238238
useLocalCrawling?: boolean;
239-
rootUrl?: string; // Currently only used by preindexed docs
240239
}
241240

242241
export interface DocsIndexingDetails {
243242
startUrl: string;
244243
config: SiteIndexingConfig;
245244
indexingStatus: IndexingStatus | undefined;
246245
chunks: Chunk[];
247-
isPreIndexedDoc: boolean;
248246
}
249247

250248
export interface IContextProvider {

core/indexing/docs/DocsCache.test.ts

+25
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,25 @@
1+
import OpenAI from "../../llm/llms/OpenAI";
2+
import { DocsCache } from "./DocsCache"; // adjust import path as needed
3+
4+
// Unit tests for the static helpers on DocsCache.
describe("DocsCache", () => {
  // Embeddings provider shared by every test in this suite; assigned once in
  // beforeAll.
  let openAIEmbeddings: OpenAI;

  beforeAll(() => {
    const providerOptions = {
      apiKey: "",
      model: "text-embedding-ada-002",
    };
    openAIEmbeddings = new OpenAI(providerOptions);
  });

  test("normalizeEmbeddingId() produces a valid ID without constructor name", async () => {
    // Normalize the ID exactly as reported by the provider instance.
    const normalizedId = DocsCache.normalizeEmbeddingId(
      openAIEmbeddings.embeddingId,
    );

    // Only the model and the max chunk size should remain; the leading
    // constructor-name segment must be gone.
    const expectedId = `${openAIEmbeddings.model}::${openAIEmbeddings.maxEmbeddingChunkSize}`;
    expect(normalizedId).toEqual(expectedId);
  });
});

core/indexing/docs/DocsCache.ts

+83
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,83 @@
1+
import request from "request";
2+
import { Chunk } from "../..";
3+
4+
export interface SiteIndexingResults {
5+
chunks: (Chunk & { embedding: number[] })[];
6+
url: string;
7+
title: string;
8+
}
9+
10+
/**
 * Static helpers for locating and downloading cached site indexing results
 * from the S3 bucket named by `BUCKET_NAME` in region `AWS_REGION`.
 */
export class DocsCache {
  static readonly AWS_REGION: string = "us-west-1";
  static readonly BUCKET_NAME: string = "continue-preindexed-docs";

  /**
   * Normalizes an embedding ID by stripping the constructor name part.
   * This is done because we don't care about the provider, just the
   * model and the max embedding chunk size.
   *
   * @param embeddingId ID whose segments are separated by "::".
   * @returns Everything after the first "::", or the input unchanged when it
   *   contains no "::".
   */
  static normalizeEmbeddingId(embeddingId: string): string {
    const separator = "::";
    const separatorIndex = embeddingId.indexOf(separator);
    if (separatorIndex === -1) {
      return embeddingId;
    }
    return embeddingId.slice(separatorIndex + separator.length);
  }

  /**
   * Gets the cache filepath for a given embedding ID and URL.
   *
   * The result is `<normalized embedding id>/<encoded url>`, where every "/"
   * in the URL is first replaced by "_" and the result is then passed through
   * `encodeURIComponent`. This derivation must stay stable: it is the key
   * under which cached results are looked up.
   *
   * @param embeddingId The embedding ID (normalized internally).
   * @param url The URL of the document.
   * @returns The filepath of the cache entry.
   */
  static getFilepathForEmbeddingIdAndUrl(
    embeddingId: string,
    url: string,
  ): string {
    const normalizedEmbeddingId = DocsCache.normalizeEmbeddingId(embeddingId);
    const normalizedUrl = encodeURIComponent(url.replace(/\//g, "_"));
    return `${normalizedEmbeddingId}/${normalizedUrl}`;
  }

  /**
   * Gets the fully qualified S3 URL for a given filepath.
   *
   * Each "/"-separated segment is percent-encoded on its own, so the
   * separators survive while any "%" already present in a segment is encoded
   * again.
   */
  private static getS3Url(filepath: string): string {
    const pathname = filepath
      .split("/")
      .map((segment) => encodeURIComponent(segment))
      .join("/");
    return `https://${DocsCache.BUCKET_NAME}.s3.${DocsCache.AWS_REGION}.amazonaws.com/${pathname}`;
  }

  /**
   * Downloads cached site indexing results from S3 for a given embedding ID and URL.
   *
   * @param embeddingId The embedding ID
   * @param url The URL of the document
   * @returns The downloaded data as a string
   * @throws Rejects with an `Error` when the response status is not 200, or
   *   with the underlying error when the request itself fails.
   */
  static async getDocsCacheForUrl(
    embeddingId: string,
    url: string,
  ): Promise<string> {
    const filepath = DocsCache.getFilepathForEmbeddingIdAndUrl(
      embeddingId,
      url,
    );
    const s3Url = DocsCache.getS3Url(filepath);

    return new Promise<string>((resolve, reject) => {
      // Collect raw chunks and decode once at the end: decoding chunk by chunk
      // can corrupt a multi-byte UTF-8 character split across two chunks.
      const chunks: Buffer[] = [];
      const download = request({ url: s3Url });

      download.on("response", (response: { statusCode?: number }) => {
        if (response.statusCode !== 200) {
          reject(
            new Error(
              `There was an error retrieving the pre-indexed doc (HTTP ${response.statusCode} for ${s3Url})`,
            ),
          );
          // The body of a failed response is never used, so stop downloading it.
          download.abort();
        }
      });

      download.on("error", (err: Error) => reject(err));
      download.on("data", (chunk: Buffer | string) => {
        chunks.push(typeof chunk === "string" ? Buffer.from(chunk) : chunk);
      });
      // After a rejection above, this resolve is a no-op.
      download.on("end", () =>
        resolve(Buffer.concat(chunks).toString("utf8")),
      );
    });
  }
}

0 commit comments

Comments
 (0)