Skip to content

Commit 37d39b0

Browse files
committed
database/delete-projects: expand scope
1 parent 4126e2c commit 37d39b0

File tree

5 files changed

+105
-30
lines changed

5 files changed

+105
-30
lines changed

src/packages/database/postgres-server-queries.coffee

+2-2
Original file line numberDiff line numberDiff line change
@@ -2591,8 +2591,8 @@ exports.extend_PostgreSQL = (ext) -> class PostgreSQL extends ext
25912591
return await unlink_old_deleted_projects(@)
25922592

25932593
# async function
2594-
cleanup_old_projects_data: () =>
2595-
return await cleanup_old_projects_data(@)
2594+
cleanup_old_projects_data: (max_run_m) =>
2595+
return await cleanup_old_projects_data(@, max_run_m)
25962596

25972597
# async function
25982598
unlist_all_public_paths: (account_id, is_owner) =>

src/packages/database/postgres/bulk-delete.test.ts

+3
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
* License: AGPLv3 s.t. "Commons Clause" – see LICENSE.md for details
44
*/
55

6+
// see packages/database/pool/pool.ts for where this name is also hard coded:
7+
process.env.PGDATABASE = "smc_ephemeral_testing_database";
8+
69
import getPool, { initEphemeralDatabase } from "@cocalc/database/pool";
710
import { uuid } from "@cocalc/util/misc";
811
import { bulk_delete } from "./bulk-delete";

src/packages/database/postgres/bulk-delete.ts

+7-4
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
1-
// see packages/database/pool/pool.ts for where this name is also hard coded:
2-
process.env.PGDATABASE = "smc_ephemeral_testing_database";
3-
41
import { escapeIdentifier } from "pg";
52

63
import getPool from "@cocalc/database/pool";
74
import { SCHEMA } from "@cocalc/util/schema";
85

6+
type Field =
7+
| "project_id"
8+
| "account_id"
9+
| "target_project_id"
10+
| "source_project_id";
11+
912
interface Opts {
1013
table: string; // e.g. project_log, etc.
11-
field: "project_id" | "account_id"; // for now, we only support a few
14+
field: Field; // for now, we only support a few
1215
id?: string; // default "id", the ID field in the table, which identifies each row uniquely
1316
value: string; // a UUID
1417
limit?: number; // default 1024

src/packages/database/postgres/delete-projects.ts

+91-23
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,12 @@ Code related to permanently deleting projects.
99

1010
import getLogger from "@cocalc/backend/logger";
1111
import getPool from "@cocalc/database/pool";
12-
import { callback2 } from "@cocalc/util/async-utils";
13-
import { PostgreSQL } from "./types";
14-
import { minutes_ago } from "@cocalc/util/misc";
1512
import { getServerSettings } from "@cocalc/database/settings";
13+
import { callback2 } from "@cocalc/util/async-utils";
1614
import { KUCALC_ON_PREMISES } from "@cocalc/util/db-schema/site-defaults";
15+
import { minutes_ago } from "@cocalc/util/misc";
16+
import { bulk_delete } from "./bulk-delete";
17+
import { PostgreSQL } from "./types";
1718

1819
const log = getLogger("db:delete-projects");
1920

@@ -59,8 +60,9 @@ async function get_account_id(
5960
}
6061

6162
/*
62-
This deletes all projects older than the given number of days, from the perspective of a user.
63-
Another task has to run to actually get rid of the data, etc.
63+
This removes all users from all projects older than the given number of days and marked as deleted.
64+
In particular, users are no longer able to access that project.
65+
The "cleanup_old_projects_data" function has to run to actually get rid of the data, etc.
6466
*/
6567
export async function unlink_old_deleted_projects(
6668
db: PostgreSQL,
@@ -70,7 +72,7 @@ export async function unlink_old_deleted_projects(
7072
query: "UPDATE projects",
7173
set: { users: null },
7274
where: [
73-
"deleted = true",
75+
"deleted = true",
7476
"users IS NOT NULL",
7577
`last_edited <= NOW() - '${age_d} days'::INTERVAL`,
7678
],
@@ -83,27 +85,32 @@ FROM projects as p
8385
INNER JOIN syncstrings as s
8486
ON p.project_id = s.project_id
8587
WHERE p.deleted = true
88+
AND users IS NULL
8689
AND p.state ->> 'state' != 'deleted'
90+
ORDER BY
91+
p.project_id, s.string_id
8792
`;
8893

8994
/*
90-
This is more thorough than the above. It issues actual delete operations on data of projects marked as deleted.
95+
This more thorough delete procedure comes after the above.
96+
It issues actual delete operations on data of projects marked as deleted.
9197
When done, it sets the state.state to "deleted".
9298
9399
The operation involves deleting all syncstrings of that project (and, associated with that, patches),
94-
and only for on-prem setups, it also deletes all the data stored in the project on disk.
100+
and, only for on-prem setups, it also deletes all the data stored in the project on disk and the project's entries in various database tables.
95101
96-
This function is called every couple of hours. Hence ensure it does not run longer than the given max_run_m time (minutes)
102+
This function is called every couple of hours. Hence it checks that it does not run longer than the given max_run_m time (minutes).
97103
*/
98104
export async function cleanup_old_projects_data(
99105
db: PostgreSQL,
100-
delay_ms = 50,
101106
max_run_m = 60,
102107
) {
103108
const settings = await getServerSettings();
104109
const on_prem = settings.kucalc === KUCALC_ON_PREMISES;
110+
const L0 = log.extend("cleanup_old_projects_data");
111+
const L = L0.debug;
105112

106-
log.debug("cleanup_old_projects_data", { delay_ms, max_run_m, on_prem });
113+
log.debug("cleanup_old_projects_data", { max_run_m, on_prem });
107114
const start_ts = new Date();
108115

109116
const pool = getPool();
@@ -115,34 +122,95 @@ export async function cleanup_old_projects_data(
115122
for (const row of rows) {
116123
const { project_id, string_id } = row;
117124
if (start_ts < minutes_ago(max_run_m)) {
118-
log.debug(
119-
`cleanup_old_projects_data: too much time elapsed, breaking after ${num} syncstrings`,
120-
);
125+
L(`too much time elapsed, breaking after ${num} syncstrings`);
121126
break;
122127
}
123128

124-
log.debug(
125-
`cleanup_old_projects_data: deleting syncstring ${project_id}/${string_id}`,
126-
);
129+
L(`deleting syncstring ${project_id}/${string_id}`);
127130
num += 1;
128131
await callback2(db.delete_syncstring, { string_id });
129132

130-
// wait for the given amount of delay_ms millio seconds
131-
await new Promise((done) => setTimeout(done, delay_ms));
133+
// wait a bit after deleting syncstrings, e.g. to let the standby db catch up
134+
await new Promise((done) => setTimeout(done, 100));
132135

136+
// Q_CLEANUP_SYNCSTRINGS orders by project_id, hence we trigger project-specific actions when the id changes
133137
if (pid != project_id) {
134138
pid = project_id;
139+
const L2 = L0.extend(project_id).debug;
140+
135141
if (on_prem) {
136-
log.debug(
137-
`cleanup_old_projects_data: deleting project data in ${project_id}`,
138-
);
142+
L2(`cleanup_old_projects_data for project_id=${project_id}`);
139143
// TODO: this only works on-prem, and requires the project files to be mounted
140144

141-
log.debug(`deleting all shared files in project ${project_id}`);
145+
L2(`deleting all shared files in project ${project_id}`);
142146
// TODO: do it directly like above, and also get rid of all those shares in the database
147+
148+
const delPublicPaths = await bulk_delete({
149+
table: "public_paths",
150+
field: "project_id",
151+
value: project_id,
152+
});
153+
L2(`deleted public_paths ${delPublicPaths.rowsDeleted} entries`);
154+
155+
const delProjectLog = await bulk_delete({
156+
table: "project_log",
157+
field: "project_id",
158+
value: project_id,
159+
});
160+
L2(`deleted project_log ${delProjectLog.rowsDeleted} entries`);
161+
162+
const delFileUse = await bulk_delete({
163+
table: "file_use",
164+
field: "project_id",
165+
value: project_id,
166+
});
167+
L2(`deleted file_use ${delFileUse.rowsDeleted} entries`);
168+
169+
const delAccessLog = await bulk_delete({
170+
table: "file_access_log",
171+
field: "project_id",
172+
value: project_id,
173+
});
174+
L2(`deleted file_access_log ${delAccessLog.rowsDeleted} entries`);
175+
176+
const delJupyterApiLog = await bulk_delete({
177+
table: "jupyter_api_log",
178+
field: "project_id",
179+
value: project_id,
180+
});
181+
L2(`deleted jupyter_api_log ${delJupyterApiLog.rowsDeleted} entries`);
182+
183+
for (const field of [
184+
"target_project_id",
185+
"source_project_id",
186+
] as const) {
187+
const delCopyPaths = await bulk_delete({
188+
table: "copy_paths",
189+
field,
190+
value: project_id,
191+
});
192+
L2(`deleted copy_paths/${field} ${delCopyPaths.rowsDeleted} entries`);
193+
}
194+
195+
const delListings = await bulk_delete({
196+
table: "listings",
197+
field: "project_id",
198+
id: "project_id", // TODO listings has a more complex ID, is this a problem?
199+
value: project_id,
200+
});
201+
L2(`deleted ${delListings.rowsDeleted} listings`);
202+
203+
const delInviteTokens = await bulk_delete({
204+
table: "project_invite_tokens",
205+
field: "project_id",
206+
value: project_id,
207+
id: "token",
208+
});
209+
L2(`deleted ${delInviteTokens.rowsDeleted} entries`);
143210
}
144211

145212
// now that we're done with that project, mark it as state.state ->> 'deleted'
213+
// in addition to the flag "deleted = true"
146214
await callback2(db.set_project_state, {
147215
project_id,
148216
state: "deleted",

src/packages/hub/run/delete-projects.js

+2-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@ async function update() {
1616
console.log("unlinking old deleted projects...");
1717
try {
1818
await db.unlink_old_deleted_projects();
19-
await db.cleanup_old_projects_data();
19+
const max_run_m = (INTERVAL_MS / 2) / (1000 * 60)
20+
await db.cleanup_old_projects_data(max_run_m);
2021
} catch (err) {
2122
if (err !== null) {
2223
throw Error(`failed to unlink projects -- ${err}`);

0 commit comments

Comments
 (0)