reload_stat -> use_cache (#585)

Yusuke Oda · web-flow · commit b2d1a9102eff · 2022-10-27T10:43:46.000+09:00
diff --git a/data/reports/absa-confidence-report.json b/data/reports/absa-confidence-report.json
@@ -2,7 +2,6 @@
   "task_name": "aspect-based-sentiment-classification",
   "source_language": "en",
   "target_language": "en",
-  "reload_stat": true,
   "source_tokenizer": {
     "cls_name": "SingleSpaceTokenizer"
   },
@@ -1666,4 +1665,4 @@
       }
     ]
   }
-}
+}
diff --git a/data/reports/report_kg.json b/data/reports/report_kg.json
@@ -6,7 +6,6 @@
   "dataset_split": null,
   "source_language": null,
   "target_language": null,
-  "reload_stat": true,
   "is_print_case": true,
   "confidence_alpha": 0.05,
   "system_details": null,
diff --git a/explainaboard/explainaboard_main.py b/explainaboard/explainaboard_main.py
@@ -245,11 +245,10 @@ def create_parser():
     )
 
     parser.add_argument(
-        "--reload-stat",
-        type=str,
-        required=False,
-        default=None,
-        help="reload precomputed statistics over training set (if exists)",
+        "--no-use-cache",
+        dest="use_cache",
+        action="store_false",
+        help="Disable cached statistics over training set.",
     )
 
     parser.add_argument(
@@ -362,7 +361,7 @@ def main():
     """The main function to be executed."""
     args = create_parser().parse_args()
 
-    reload_stat: bool = False if args.reload_stat == "0" else True
+    use_cache: bool = args.use_cache
     system_outputs: list[str] = args.system_outputs
 
     reports: list[str] | None = args.reports
@@ -479,7 +478,6 @@ def load_system_details_path():
             "split_name": split,
             "source_language": source_language,
             "target_language": target_language,
-            "reload_stat": reload_stat,
             "confidence_alpha": args.confidence_alpha,
             "system_details": system_details,
             "custom_features": system_datasets[0].metadata.custom_features,
@@ -510,6 +508,7 @@ def load_system_details_path():
                 metadata=metadata_copied,
                 sys_output=system_dataset.samples,
                 skip_failed_analyses=args.skip_failed_analyses,
+                use_cache=use_cache,
             )
             reports.append(report)
 
diff --git a/explainaboard/info.py b/explainaboard/info.py
@@ -81,14 +81,12 @@ class SysOutputInfo(Serializable):
         dataset_split (str): the name of the split.
         source_language (str): the language of the input
         target_language (str): the language of the output
-        reload_stat (bool): whether to reload the statistics or not
         system_details (dict): a dictionary of system details
         source_tokenizer (Tokenizer): the tokenizer for source sentences
         target_tokenizer (Tokenizer): the tokenizer for target sentences
         analysis_levels: the levels of analysis to perform
     """
 
-    DEFAULT_RELOAD_STAT: ClassVar[bool] = True
     DEFAULT_CONFIDENCE_ALPHA: ClassVar[float] = 0.05
 
     task_name: str | None = None
@@ -98,7 +96,6 @@ class SysOutputInfo(Serializable):
     dataset_split: str | None = None
     source_language: str | None = None
     target_language: str | None = None
-    reload_stat: bool = DEFAULT_RELOAD_STAT
     # NOTE(odashi): confidence_alpha == None has a meaning beyond "unset": it prevents
     # calculating confidence intervals.
     confidence_alpha: float | None = DEFAULT_CONFIDENCE_ALPHA
@@ -182,7 +179,6 @@ def serialize(self) -> dict[str, SerializableData]:
             "dataset_split": self.dataset_split,
             "source_language": self.source_language,
             "target_language": self.target_language,
-            "reload_stat": self.reload_stat,
             "confidence_alpha": self.confidence_alpha,
             "system_details": self.system_details,
             "source_tokenizer": self.source_tokenizer,
@@ -223,9 +219,6 @@ def deserialize(cls, data: dict[str, SerializableData]) -> Serializable:
             dataset_split=_get_value(data, str, "dataset_split"),
             source_language=_get_value(data, str, "source_language"),
             target_language=_get_value(data, str, "target_language"),
-            reload_stat=unwrap_or(
-                _get_value(data, bool, "reload_stat"), cls.DEFAULT_RELOAD_STAT
-            ),
             confidence_alpha=confidence_alpha,
             system_details=system_details,
             source_tokenizer=_get_value(
diff --git a/explainaboard/info_test.py b/explainaboard/info_test.py
@@ -100,7 +100,6 @@ def test_serialization(self) -> None:
             dataset_split="quux",
             source_language="en",
             target_language="zh",
-            reload_stat=True,
             confidence_alpha=None,
             system_details={"detail": 123},
             source_tokenizer=tokenizer1,
@@ -126,7 +125,6 @@ def test_serialization(self) -> None:
             "dataset_split": "quux",
             "source_language": "en",
             "target_language": "zh",
-            "reload_stat": True,
             "system_details": {"detail": 123},
             "source_tokenizer": tokenizer1_serialized,
             "target_tokenizer": tokenizer2_serialized,
@@ -168,7 +166,6 @@ def test_serialization(self) -> None:
             self.assertEqual(deserialized.dataset_split, sysout.dataset_split)
             self.assertEqual(deserialized.source_language, sysout.source_language)
             self.assertEqual(deserialized.target_language, sysout.target_language)
-            self.assertEqual(deserialized.reload_stat, sysout.reload_stat)
             self.assertEqual(deserialized.confidence_alpha, sysout.confidence_alpha)
             self.assertEqual(deserialized.system_details, sysout.system_details)
             self.assertIsInstance(deserialized.source_tokenizer, SingleSpaceTokenizer)
@@ -191,7 +188,6 @@ def test_from_any_dict(self) -> None:
         self.assertIsNone(deserialized.dataset_split)
         self.assertIsNone(deserialized.source_language)
         self.assertIsNone(deserialized.target_language)
-        self.assertEqual(deserialized.reload_stat, SysOutputInfo.DEFAULT_RELOAD_STAT)
         self.assertEqual(
             deserialized.confidence_alpha, SysOutputInfo.DEFAULT_CONFIDENCE_ALPHA
         )
diff --git a/explainaboard/processors/processor.py b/explainaboard/processors/processor.py
@@ -157,14 +157,15 @@ def _get_statistics_resources(self, sys_info: SysOutputInfo) -> dict[str, Any]:
     def _statistics_func(self, samples: Iterable[Any], sys_info: SysOutputInfo) -> Any:
         ...
 
-    def _gen_external_stats(self, sys_info: SysOutputInfo) -> Any:
+    def _gen_external_stats(self, sys_info: SysOutputInfo, use_cache: bool) -> Any:
         """Generate external statistics.
 
         These are gathered from a relatively costly source, such as the training set,
         then cached for future use.
 
         Args:
             sys_info: Information about the system outputs
+            use_cache: whether to reload the statistics from cache or not.
 
         Returns:
             Statistics from, usually, the training set that are used to calculate
@@ -179,7 +180,7 @@ def _gen_external_stats(self, sys_info: SysOutputInfo) -> Any:
                 else sys_info.sub_dataset_name
             )
             # read statistics from cache
-            if sys_info.reload_stat:
+            if use_cache:
                 statistics = read_statistics_from_cache(
                     sys_info.dataset_name, sub_dataset
                 )
@@ -497,13 +498,17 @@ def sort_bucket_info(
                 raise ValueError(f"Invalid sort_by: {sort_by}")
 
     def get_overall_statistics(
-        self, metadata: dict, sys_output: list[dict]
+        self,
+        metadata: dict,
+        sys_output: list[dict],
+        use_cache: bool = True,
     ) -> OverallStatistics:
         """Get the overall statistics information of the system output.
 
         Args:
             metadata: The metadata of the system
             sys_output: The system output itself
+            use_cache: whether to reload the statistics from cache or not.
         """
         if metadata is None:
             metadata = {}
@@ -542,7 +547,7 @@ def get_overall_statistics(
         )
 
         # get scoring statistics
-        external_stats = self._gen_external_stats(sys_info)
+        external_stats = self._gen_external_stats(sys_info, use_cache)
 
         # generate cases for each level
         analysis_cases: list[list[AnalysisCase]] = []
@@ -561,19 +566,28 @@ def get_overall_statistics(
 
     @final
     def process(
-        self, metadata: dict, sys_output: list[dict], skip_failed_analyses: bool = False
+        self,
+        metadata: dict,
+        sys_output: list[dict],
+        skip_failed_analyses: bool = False,
+        use_cache: bool = True,
     ) -> SysOutputInfo:
         """Run the whole process of processing the output.
 
         Args:
             metadata: The metadata used to specify information about processing.
             sys_output: The list of system outputs.
             skip_failed_analyses: Whether to skip failed analyses.
+            use_cache: whether to reload the statistics from cache or not.
 
         Returns:
             Information about the processed system output.
         """
-        overall_statistics = self.get_overall_statistics(metadata, sys_output)
+        overall_statistics = self.get_overall_statistics(
+            metadata,
+            sys_output,
+            use_cache,
+        )
         sys_info = unwrap(overall_statistics.sys_info)
         analyses = self.perform_analyses(
             sys_info,
diff --git a/integration_tests/artifacts/reports/test-ar_6960.json b/integration_tests/artifacts/reports/test-ar_6960.json
@@ -6,7 +6,6 @@
     "metric_names": [
         "Accuracy"
     ],
-    "reload_stat": true,
     "is_print_case": true,
     "language": "ar",
     "confidence_alpha": 0.05,
diff --git a/integration_tests/artifacts/reports/test-de_7213.json b/integration_tests/artifacts/reports/test-de_7213.json
@@ -6,7 +6,6 @@
     "metric_names": [
         "Accuracy"
     ],
-    "reload_stat": true,
     "is_print_case": true,
     "language": "de",
     "confidence_alpha": 0.05,
diff --git a/integration_tests/artifacts/reports/test-de_9330.json b/integration_tests/artifacts/reports/test-de_9330.json
@@ -6,7 +6,6 @@
     "metric_names": [
         "Accuracy"
     ],
-    "reload_stat": true,
     "is_print_case": true,
     "language": "de",
     "confidence_alpha": 0.05,
diff --git a/integration_tests/artifacts/reports/test-de_9335.json b/integration_tests/artifacts/reports/test-de_9335.json
@@ -6,7 +6,6 @@
     "metric_names": [
         "Accuracy"
     ],
-    "reload_stat": true,
     "is_print_case": true,
     "language": "de",
     "confidence_alpha": 0.05,
diff --git a/integration_tests/artifacts/reports/test-en_7676.json b/integration_tests/artifacts/reports/test-en_7676.json
@@ -6,7 +6,6 @@
     "metric_names": [
         "Accuracy"
     ],
-    "reload_stat": true,
     "is_print_case": true,
     "language": "en",
     "confidence_alpha": 0.05,
diff --git a/integration_tests/artifacts/reports/test-en_7872.json b/integration_tests/artifacts/reports/test-en_7872.json
@@ -6,7 +6,6 @@
     "metric_names": [
         "Accuracy"
     ],
-    "reload_stat": true,
     "is_print_case": true,
     "language": "en",
     "confidence_alpha": 0.05,
diff --git a/integration_tests/artifacts/reports/test-en_8113.json b/integration_tests/artifacts/reports/test-en_8113.json
@@ -7,7 +7,6 @@
         "F1ScoreQA",
         "ExactMatchQA"
     ],
-    "reload_stat": true,
     "is_print_case": true,
     "language": "en",
     "confidence_alpha": 0.05,
diff --git a/integration_tests/artifacts/reports/test-en_8235.json b/integration_tests/artifacts/reports/test-en_8235.json
@@ -7,7 +7,6 @@
         "F1ScoreQA",
         "ExactMatchQA"
     ],
-    "reload_stat": true,
     "is_print_case": true,
     "language": "en",
     "confidence_alpha": 0.05,
diff --git a/integration_tests/artifacts/reports/test-en_9152.json b/integration_tests/artifacts/reports/test-en_9152.json
@@ -6,7 +6,6 @@
     "metric_names": [
         "Accuracy"
     ],
-    "reload_stat": true,
     "is_print_case": true,
     "language": "en",
     "confidence_alpha": 0.05,
diff --git a/integration_tests/artifacts/reports/test-en_9200.json b/integration_tests/artifacts/reports/test-en_9200.json
@@ -6,7 +6,6 @@
     "metric_names": [
         "Accuracy"
     ],
-    "reload_stat": true,
     "is_print_case": true,
     "language": "en",
     "confidence_alpha": 0.05,
diff --git a/integration_tests/artifacts/reports/test-es_7377.json b/integration_tests/artifacts/reports/test-es_7377.json
@@ -6,7 +6,6 @@
     "metric_names": [
         "Accuracy"
     ],
-    "reload_stat": true,
     "is_print_case": true,
     "language": "es",
     "confidence_alpha": 0.05,
diff --git a/integration_tests/artifacts/reports/test-es_7678.json b/integration_tests/artifacts/reports/test-es_7678.json
@@ -6,7 +6,6 @@
     "metric_names": [
         "Accuracy"
     ],
-    "reload_stat": true,
     "is_print_case": true,
     "language": "es",
     "confidence_alpha": 0.05,
diff --git a/integration_tests/artifacts/reports/test-es_7687.json b/integration_tests/artifacts/reports/test-es_7687.json
@@ -7,7 +7,6 @@
         "F1ScoreQA",
         "ExactMatchQA"
     ],
-    "reload_stat": true,
     "is_print_case": true,
     "language": "es",
     "confidence_alpha": 0.05,
diff --git a/integration_tests/artifacts/reports/test-es_7698.json b/integration_tests/artifacts/reports/test-es_7698.json
@@ -7,7 +7,6 @@
         "F1ScoreQA",
         "ExactMatchQA"
     ],
-    "reload_stat": true,
     "is_print_case": true,
     "language": "es",
     "confidence_alpha": 0.05,
diff --git a/integration_tests/artifacts/reports/test-es_9340.json b/integration_tests/artifacts/reports/test-es_9340.json
@@ -6,7 +6,6 @@
     "metric_names": [
         "Accuracy"
     ],
-    "reload_stat": true,
     "is_print_case": true,
     "language": "es",
     "confidence_alpha": 0.05,
diff --git a/integration_tests/artifacts/reports/test-es_9342.json b/integration_tests/artifacts/reports/test-es_9342.json
@@ -6,7 +6,6 @@
     "metric_names": [
         "Accuracy"
     ],
-    "reload_stat": true,
     "is_print_case": true,
     "language": "es",
     "confidence_alpha": 0.05,
diff --git a/integration_tests/artifacts/reports/test-fr_9262.json b/integration_tests/artifacts/reports/test-fr_9262.json
@@ -6,7 +6,6 @@
     "metric_names": [
         "Accuracy"
     ],
-    "reload_stat": true,
     "is_print_case": true,
     "language": "fr",
     "confidence_alpha": 0.05,
diff --git a/integration_tests/artifacts/reports/test-fr_9332.json b/integration_tests/artifacts/reports/test-fr_9332.json
@@ -6,7 +6,6 @@
     "metric_names": [
         "Accuracy"
     ],
-    "reload_stat": true,
     "is_print_case": true,
     "language": "fr",
     "confidence_alpha": 0.05,
diff --git a/integration_tests/artifacts/reports/test-ja_9137.json b/integration_tests/artifacts/reports/test-ja_9137.json
@@ -6,7 +6,6 @@
     "metric_names": [
         "Accuracy"
     ],
-    "reload_stat": true,
     "is_print_case": true,
     "language": "ja",
     "confidence_alpha": 0.05,
diff --git a/integration_tests/artifacts/reports/test-ja_9150.json b/integration_tests/artifacts/reports/test-ja_9150.json
@@ -6,7 +6,6 @@
     "metric_names": [
         "Accuracy"
     ],
-    "reload_stat": true,
     "is_print_case": true,
     "language": "ja",
     "confidence_alpha": 0.05,
diff --git a/integration_tests/artifacts/reports/test-zh_7117.json b/integration_tests/artifacts/reports/test-zh_7117.json
@@ -6,7 +6,6 @@
     "metric_names": [
         "Accuracy"
     ],
-    "reload_stat": true,
     "is_print_case": true,
     "language": "zh",
     "confidence_alpha": 0.05,
diff --git a/integration_tests/artifacts/reports/test-zh_7311.json b/integration_tests/artifacts/reports/test-zh_7311.json
@@ -6,7 +6,6 @@
     "metric_names": [
         "Accuracy"
     ],
-    "reload_stat": true,
     "is_print_case": true,
     "language": "zh",
     "confidence_alpha": 0.05,
diff --git a/integration_tests/artifacts/reports/test-zh_7396.json b/integration_tests/artifacts/reports/test-zh_7396.json
diff --git a/integration_tests/artifacts/reports/test-zh_7443.json b/integration_tests/artifacts/reports/test-zh_7443.json
diff --git a/integration_tests/artifacts/reports/test-zh_8680.json b/integration_tests/artifacts/reports/test-zh_8680.json
diff --git a/integration_tests/artifacts/reports/test-zh_8705.json b/integration_tests/artifacts/reports/test-zh_8705.json
diff --git a/integration_tests/text_classification_test.py b/integration_tests/text_classification_test.py

Original file line number	Diff line number	Diff line change
`@@ -2,7 +2,6 @@`
`2`	`2`	`"task_name": "aspect-based-sentiment-classification",`
`3`	`3`	`"source_language": "en",`
`4`	`4`	`"target_language": "en",`
`5`		`- "reload_stat": true,`
`6`	`5`	`"source_tokenizer": {`
`7`	`6`	`"cls_name": "SingleSpaceTokenizer"`
`8`	`7`	`},`
`@@ -1666,4 +1665,4 @@`
`1666`	`1665`	`}`
`1667`	`1666`	`]`
`1668`	`1667`	`}`
`1669`		`-}`
	`1668`	`+}`