datamol-io · jstlaurent · May 6, 2025 · Apr 30, 2025 · Apr 30, 2025
@@ -47,9 +47,9 @@ jobs:
           python -c "import poptorch"
 
           # Download the data files (total ~10 MB, negligible compared to the libraries)
-          wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/ZINC12k.csv.gz
-          wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/Tox21-7k-12-labels.csv.gz
-          wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Small-dataset/qm9.csv.gz
+          wget https://storage.valencelabs.com/graphium/datasets/neurips_2023/Small-dataset/ZINC12k.csv.gz
+          wget https://storage.valencelabs.com/graphium/datasets/neurips_2023/Small-dataset/Tox21-7k-12-labels.csv.gz
+          wget https://storage.valencelabs.com/graphium/datasets/neurips_2023/Small-dataset/qm9.csv.gz
 
 
           # Install the IPU specific and graphium requirements

@@ -64,8 +64,8 @@
     "# download from https://raw.githubusercontent.com/aspuru-guzik-group/chemical_vae/master/models/zinc_properties/250k_rndm_zinc_drugs_clean_3.csv\n",
     "# data = pd.read_csv(\"/home/hadim/250k_rndm_zinc_drugs_clean_3.csv\", usecols=[\"smiles\"])\n",
     "\n",
-    "# download from https://storage.googleapis.com/graphium-public/datasets/QM9/norm_qm9.csv\n",
-    "data = pd.read_csv(\"https://storage.googleapis.com/graphium-public/datasets/QM9/norm_qm9.csv\", usecols=[\"smiles\"])"
+    "# download from https://storage.valencelabs.com/graphium/datasets/QM9/norm_qm9.csv\n",
+    "data = pd.read_csv(\"https://storage.valencelabs.com/graphium/datasets/QM9/norm_qm9.csv\", usecols=[\"smiles\"])"
    ]
   },
   {

@@ -65,8 +65,8 @@
     "# download from https://raw.githubusercontent.com/aspuru-guzik-group/chemical_vae/master/models/zinc_properties/250k_rndm_zinc_drugs_clean_3.csv\n",
     "# data = pd.read_csv(\"/home/hadim/250k_rndm_zinc_drugs_clean_3.csv\", usecols=[\"smiles\"])\n",
     "\n",
-    "# download from https://storage.googleapis.com/graphium-public/datasets/QM9/norm_qm9.csv\n",
-    "data = pd.read_csv(\"https://storage.googleapis.com/graphium-public/datasets/QM9/norm_qm9.csv\", usecols=[\"smiles\"])"
+    "# download from https://storage.valencelabs.com/graphium/datasets/QM9/norm_qm9.csv\n",
+    "data = pd.read_csv(\"https://storage.valencelabs.com/graphium/datasets/QM9/norm_qm9.csv\", usecols=[\"smiles\"])"
    ]
   },
   {

@@ -66,12 +66,12 @@ datamodule:
         df: null
         task_level: "graph"
         df_path: ~/scratch/data/graphium/data/PCQM4M/pcqm4mv2-20k.csv
-        # wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2-20k.csv
-        # or set path as https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2-20k.csv directly
+        # wget https://storage.valencelabs.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2-20k.csv
+        # or set df_path to https://storage.valencelabs.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2-20k.csv directly
         smiles_col: "cxsmiles"
         label_cols: ["homo_lumo_gap"]
         # sample_size: 30000 # use sample_size for test
-        # splits_path: graphium/data/PCQM4M/split_dict_v2.pt  # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt`
+        # splits_path: graphium/data/PCQM4M/split_dict_v2.pt  # Download with `wget https://storage.valencelabs.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt`
         split_val: 0.1
         split_test: 0.1
 

@@ -15,12 +15,12 @@ datamodule:
         df: null
         task_level: "graph"
         df_path: ~/scratch/data/graphium/data/PCQM4M/pcqm4mv2-20k.csv
-        # wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2-20k.csv
-        # or set path as https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2-20k.csv directly
+        # wget https://storage.valencelabs.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2-20k.csv
+        # or set df_path to https://storage.valencelabs.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2-20k.csv directly
         smiles_col: "cxsmiles"
         label_cols: ["homo_lumo_gap"]
         # sample_size: 30000 # use sample_size for test
-        # splits_path: graphium/data/PCQM4Mv2/split_dict.pt  # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict.pt`
+        # splits_path: graphium/data/PCQM4Mv2/split_dict.pt  # Download with `wget https://storage.valencelabs.com/datasets-public-research/PCQM4M/cxsmiles/split_dict.pt`
         split_val: 0.1
         split_test: 0.1
 

@@ -67,8 +67,8 @@ datamodule:
         df: null
         task_level: "graph"
         df_path: graphium/data/b3lyp/b3lyp_mini.parquet #graphium/data/b3lyp/b3lyp_mini.parquet
-        # wget https://storage.googleapis.com/graphium-public/datasets/b3lyp/b3lyp_mini.parquet
-        # or set path as https://storage.googleapis.com/graphium-public/datasets/b3lyp/b3lyp_mini.parquet directly
+        # wget https://storage.valencelabs.com/graphium/datasets/b3lyp/b3lyp_mini.parquet
+        # or set df_path to https://storage.valencelabs.com/graphium/datasets/b3lyp/b3lyp_mini.parquet directly
         smiles_col: "smiles"
         label_cols: ["beta_gap"]
         # sample_size: 30000 # use sample_size for test
@@ -78,12 +78,12 @@ datamodule:
         df: null
         task_level: "graph"
         df_path: graphium/data/b3lyp/b3lyp_mini.parquet #graphium/data/b3lyp/b3lyp_mini.parquet
-        # wget https://storage.googleapis.com/graphium-public/datasets/b3lyp/b3lyp_mini.parquet
-        # or set path as https://storage.googleapis.com/graphium-public/datasets/b3lyp/b3lyp_mini.parquet directly
+        # wget https://storage.valencelabs.com/graphium/datasets/b3lyp/b3lyp_mini.parquet
+        # or set df_path to https://storage.valencelabs.com/graphium/datasets/b3lyp/b3lyp_mini.parquet directly
         smiles_col: "smiles"
         label_cols: ["alpha_gap"]
         # sample_size: 30000 # use sample_size for test
-        # splits_path: graphium/data/PCQM4M/split_dict_v2.pt  # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt`
+        # splits_path: graphium/data/PCQM4M/split_dict_v2.pt  # Download with `wget https://storage.valencelabs.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt`
         split_val: 0.1
         split_test: 0.1
 

@@ -15,12 +15,12 @@ datamodule:
         df: null
         task_level: "graph"
         df_path: graphium/data/PCQM4M/pcqm4mv2-20k.csv
-        # wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2-20k.csv
-        # or set path as https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2-20k.csv directly
+        # wget https://storage.valencelabs.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2-20k.csv
+        # or set df_path to https://storage.valencelabs.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2-20k.csv directly
         smiles_col: "cxsmiles"
         label_cols: ["homo_lumo_gap"]
         # sample_size: 6000 # use sample_size for test
-        splits_path: graphium/data/PCQM4Mv2/split_dict_v2.pt  # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt`
+        splits_path: graphium/data/PCQM4Mv2/split_dict_v2.pt  # Download with `wget https://storage.valencelabs.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt`
         # graphium/data/PCQM4Mv2/split_dict.pt
         # graphium/data/PCQM4Mv2/pcqm4m_split.csv
         split_names: ["train", "valid", "test-dev"]

@@ -38,12 +38,12 @@ datamodule:
       l1000_mcf7:
         df: null
         df_path: ../data/graphium/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz
-        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz
+        # wget https://storage.valencelabs.com/graphium/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz
         # or set path as the URL directly
         smiles_col: "SMILES"
         label_cols: geneID-*  # geneID-* means all columns starting with "geneID-"
         # sample_size: 2000 # use sample_size for test
         task_level: graph
-        splits_path: ../data/graphium/large-dataset/l1000_mcf7_random_splits.pt  # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt`
+        splits_path: ../data/graphium/large-dataset/l1000_mcf7_random_splits.pt  # Download with `wget https://storage.valencelabs.com/graphium/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt`
         # split_names: [train, val, test_seen]
         epoch_sampling_fraction: 1.0
@@ -38,12 +38,12 @@ datamodule:
       l1000_vcap:
         df: null
         df_path: ../data/graphium/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz
-        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz
+        # wget https://storage.valencelabs.com/graphium/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz
         # or set path as the URL directly
         smiles_col: "SMILES"
         label_cols: geneID-*  # geneID-* means all columns starting with "geneID-"
         # sample_size: 2000 # use sample_size for test
         task_level: graph
-        splits_path: ../data/graphium/large-dataset/l1000_vcap_random_splits.pt  # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt`
+        splits_path: ../data/graphium/large-dataset/l1000_vcap_random_splits.pt  # Download with `wget https://storage.valencelabs.com/graphium/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt`
         # split_names: [train, val, test_seen]
         epoch_sampling_fraction: 1.0
@@ -85,52 +85,52 @@ datamodule:
       l1000_vcap:
         df: null
         df_path: ../data/graphium/large-dataset/LINCS_L1000_VCAP_0-2_th2.csv.gz
-        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz
+        # wget https://storage.valencelabs.com/graphium/datasets/neurips_2023/Large-dataset/LINCS_L1000_VCAP_0-4.csv.gz
         # or set path as the URL directly
         smiles_col: "SMILES"
         label_cols: geneID-*  # geneID-* means all columns starting with "geneID-"
         # sample_size: 2000 # use sample_size for test
         task_level: graph
-        splits_path: ../data/graphium/large-dataset/l1000_vcap_random_splits.pt  # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt`
+        splits_path: ../data/graphium/large-dataset/l1000_vcap_random_splits.pt  # Download with `wget https://storage.valencelabs.com/graphium/datasets/neurips_2023/Large-dataset/l1000_vcap_random_splits.pt`
         # split_names: [train, val, test_seen]
         epoch_sampling_fraction: 1.0
 
       l1000_mcf7:
         df: null
         df_path: ../data/graphium/large-dataset/LINCS_L1000_MCF7_0-2_th2.csv.gz
-        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz
+        # wget https://storage.valencelabs.com/graphium/datasets/neurips_2023/Large-dataset/LINCS_L1000_MCF7_0-4.csv.gz
         # or set path as the URL directly
         smiles_col: "SMILES"
         label_cols: geneID-*  # geneID-* means all columns starting with "geneID-"
         # sample_size: 2000 # use sample_size for test
         task_level: graph
-        splits_path: ../data/graphium/large-dataset/l1000_mcf7_random_splits.pt  # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt`
+        splits_path: ../data/graphium/large-dataset/l1000_mcf7_random_splits.pt  # Download with `wget https://storage.valencelabs.com/graphium/datasets/neurips_2023/Large-dataset/l1000_mcf7_random_splits.pt`
         # split_names: [train, val, test_seen]
         epoch_sampling_fraction: 1.0
 
       pcba_1328:
         df: null
         df_path: ../data/graphium/large-dataset/PCBA_1328_1564k.parquet
-        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet
+        # wget https://storage.valencelabs.com/graphium/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet
         # or set path as the URL directly
         smiles_col: "SMILES"
         label_cols: assayID-*  # assayID-* means all columns starting with "assayID-"
         # sample_size: 2000 # use sample_size for test
         task_level: graph
-        splits_path: ../data/graphium/large-dataset/pcba_1328_random_splits.pt  # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt`
+        splits_path: ../data/graphium/large-dataset/pcba_1328_random_splits.pt  # Download with `wget https://storage.valencelabs.com/graphium/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt`
         # split_names: [train, val, test_seen]
         epoch_sampling_fraction: 1.0
 
       pcqm4m_g25:
         df: null
         df_path: ../data/graphium/large-dataset/PCQM4M_G25_N4.parquet
-        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet
+        # wget https://storage.valencelabs.com/graphium/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet
         # or set path as the URL directly
         smiles_col: "ordered_smiles"
         label_cols: graph_*  # graph_* means all columns starting with "graph_"
         # sample_size: 2000 # use sample_size for test
         task_level: graph
-        splits_path: ../data/graphium/large-dataset/pcqm4m_g25_n4_random_splits.pt  # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt`
+        splits_path: ../data/graphium/large-dataset/pcqm4m_g25_n4_random_splits.pt  # Download with `wget https://storage.valencelabs.com/graphium/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt`
         # split_names: [train, val, test_seen]
         label_normalization:
           normalize_val_test: True
@@ -140,13 +140,13 @@ datamodule:
       pcqm4m_n4:
         df: null
         df_path: ../data/graphium/large-dataset/PCQM4M_G25_N4.parquet
-        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet
+        # wget https://storage.valencelabs.com/graphium/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet
         # or set path as the URL directly
         smiles_col: "ordered_smiles"
         label_cols: node_* # node_* means all columns starting with "node_"
         # sample_size: 2000 # use sample_size for test
         task_level: node
-        splits_path: ../data/graphium/large-dataset/pcqm4m_g25_n4_random_splits.pt  # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt`
+        splits_path: ../data/graphium/large-dataset/pcqm4m_g25_n4_random_splits.pt  # Download with `wget https://storage.valencelabs.com/graphium/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt`
         # split_names: [train, val, test_seen]
         seed: 42
         label_normalization:

@@ -30,12 +30,12 @@ datamodule:
       pcba_1328:
         df: null
         df_path: ../data/graphium/large-dataset/PCBA_1328_1564k.parquet
-        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet
+        # wget https://storage.valencelabs.com/graphium/datasets/neurips_2023/Large-dataset/PCBA_1328_1564k.parquet
         # or set path as the URL directly
         smiles_col: "SMILES"
         label_cols: assayID-*  # assayID-* means all columns starting with "assayID-"
         # sample_size: 2000 # use sample_size for test
         task_level: graph
-        splits_path: ../data/graphium/large-dataset/pcba_1328_random_splits.pt  # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt`
+        splits_path: ../data/graphium/large-dataset/pcba_1328_random_splits.pt  # Download with `wget https://storage.valencelabs.com/graphium/datasets/neurips_2023/Large-dataset/pcba_1328_random_splits.pt`
         # split_names: [train, val, test_seen]
         epoch_sampling_fraction: 1.0
@@ -32,12 +32,12 @@ datamodule:
         df: null
         task_level: "graph"
         df_path: graphium/data/PCQM4M/pcqm4mv2.csv
-        # wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv
-        # or set path as https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv directly
+        # wget https://storage.valencelabs.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv
+        # or set df_path to https://storage.valencelabs.com/datasets-public-research/PCQM4M/cxsmiles/pcqm4mv2.csv directly
         smiles_col: "cxsmiles"
         label_cols: ["homo_lumo_gap"]
         # sample_size: 8000 # use sample_size for test
-        splits_path: graphium/data/PCQM4M/split_dict_v2.pt  # Download with `wget https://storage.googleapis.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt`
+        splits_path: graphium/data/PCQM4M/split_dict_v2.pt  # Download with `wget https://storage.valencelabs.com/datasets-public-research/PCQM4M/cxsmiles/split_dict_v2.pt`
         split_names: ["train", "valid", "test-dev"]
         # graphium/data/PCQM4Mv2/split_dict.pt
         # graphium/data/PCQM4Mv2/pcqm4m_split.csv

@@ -32,13 +32,13 @@ datamodule:
       pcqm4m_g25:
         df: null
         df_path: ../data/graphium/large-dataset/PCQM4M_G25_N4.parquet
-        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet
+        # wget https://storage.valencelabs.com/graphium/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet
         # or set path as the URL directly
         smiles_col: "ordered_smiles"
         label_cols: graph_*  # graph_* means all columns starting with "graph_"
         # sample_size: 2000 # use sample_size for test
         task_level: graph
-        splits_path: ../data/graphium/large-dataset/pcqm4m_g25_n4_random_splits.pt  # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt`
+        splits_path: ../data/graphium/large-dataset/pcqm4m_g25_n4_random_splits.pt  # Download with `wget https://storage.valencelabs.com/graphium/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt`
         # split_names: [train, val, test_seen]
         label_normalization:
           normalize_val_test: True

@@ -30,13 +30,13 @@ datamodule:
       pcqm4m_n4:
         df: null
         df_path: ../data/graphium/large-dataset/PCQM4M_G25_N4.parquet
-        # wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet
+        # wget https://storage.valencelabs.com/graphium/datasets/neurips_2023/Large-dataset/PCQM4M_G25_N4.parquet
         # or set path as the URL directly
         smiles_col: "ordered_smiles"
         label_cols: node_* # node_* means all columns starting with "node_"
         # sample_size: 2000 # use sample_size for test
         task_level: node
-        splits_path: ../data/graphium/large-dataset/pcqm4m_g25_n4_random_splits.pt  # Download with `wget https://storage.googleapis.com/graphium-public/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt`
+        splits_path: ../data/graphium/large-dataset/pcqm4m_g25_n4_random_splits.pt  # Download with `wget https://storage.valencelabs.com/graphium/datasets/neurips_2023/Large-dataset/pcqm4m_g25_n4_random_splits.pt`
         # split_names: [train, val, test_seen]
         seed: 42
         label_normalization: