[ci] use pre-commit, update actions (#61)

jameslamb · web-flow · commit 4c4e4a3ea707 · 2024-06-20T21:08:42.000-05:00
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -14,17 +14,17 @@ jobs:
           - task: linting
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
       - name: Set up Python
-        uses: conda-incubator/setup-miniconda@v2
+        uses: conda-incubator/setup-miniconda@v3
         with:
           python-version: 3.11
       - name: linting
         if: matrix.task == 'linting'
         shell: bash
         run: |
-          pip install --upgrade black flake8 isort nbqa
-          make lint
+          pip install --upgrade pre-commit
+          pre-commit run --all-files
   all-tests-successful:
     if: always()
     runs-on: ubuntu-latest
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,49 @@
+---
+exclude: |
+  (?x)^(
+      LightGBM
+  )$
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.6.0
+    hooks:
+      - id: check-toml
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+  - repo: https://github.com/pycqa/isort
+    rev: 5.13.2
+    hooks:
+      - id: isort
+        name: isort (python)
+        args: ["--settings-path", "pyproject.toml"]
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.10.0
+    hooks:
+      - id: mypy
+        args: ["--config-file", "pyproject.toml"]
+        exclude: "tests"
+        additional_dependencies:
+          - types-requests
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version.
+    rev: v0.4.10
+    hooks:
+      # Run the linter.
+      - id: ruff
+        args: ["--config", "pyproject.toml"]
+        types_or: [jupyter, python]
+      # Run the formatter.
+      - id: ruff-format
+        args: ["--config", "pyproject.toml"]
+        types_or: [python, jupyter]
+  - repo: https://github.com/maxwinterstein/shfmt-py
+    rev: v3.7.0.1
+    hooks:
+      - id: shfmt
+        args: ["--indent=4", "--space-redirects", "--write"]
+  - repo: https://github.com/shellcheck-py/shellcheck-py
+    rev: v0.10.0.1
+    hooks:
+      - id: shellcheck
+        args: ["--exclude=SC2002"]
diff --git a/Makefile b/Makefile
@@ -68,13 +68,6 @@ ecr-details.json:
 			--repository-name ${CLUSTER_IMAGE_NAME} \
 	> ./ecr-details.json
 
-.PHONY: format
-format:
-	black .
-	isort .
-	nbqa isort .
-	nbqa black .
-
 $(LIGHTGBM_REPO):
 	git clone --recursive https://github.com/microsoft/LightGBM.git
 
@@ -100,15 +93,6 @@ lightgbm-unit-tests:
 		/bin/bash -cex \
 			"sh ./build-python.sh install --precompile && pip install pytest && pytest -vv -rA tests/python_package_test/test_dask.py"
 
-.PHONY: lint
-lint: lint-dockerfiles
-	isort --check .
-	black --check --diff .
-	flake8 --count .
-	nbqa black --check --diff .
-	nbqa flake8 .
-	nbqa isort --check .
-
 .PHONY: lint-dockerfiles
 lint-dockerfiles:
 	for dockerfile in $$(ls | grep -E '^Dockerfile'); do \
diff --git a/bin/profile-example-memory-usage.sh b/bin/profile-example-memory-usage.sh
@@ -8,17 +8,19 @@ set -e -u -o pipefail
 
 echo "profiling examples"
 mkdir -p "${PROFILING_OUTPUT_DIR}/bin"
+
+# shellcheck disable=SC2044
 for py_script in $(find "${LIGHTGBM_HOME}/examples/python-guide" -name '*.py'); do
     base_filename=$(basename "${py_script}")
-    prof_file=$(echo "${base_filename}" | sed -e 's/\.py/\.bin/g')
-    table_file=$(echo "${base_filename}" | sed -e 's/\.py/-table\.html/g')
-    leak_table_file=$(echo "${base_filename}" | sed -e 's/\.py/-leak-table\.html/g')
-    flamegraph_file=$(echo "${base_filename}" | sed -e 's/\.py/-flamegraph\.html/g')
+    prof_file="${base_filename/.py/.bin}"
+    table_file="${base_filename/.py/-table.html}"
+    leak_table_file="${base_filename/.py/-leak-table.html}"
+    flamegraph_file="${base_filename/.py/-flamegraph.html}"
     echo "  - ${base_filename}"
     memray run \
         -o "${PROFILING_OUTPUT_DIR}/bin/${prof_file}" \
-        "${py_script}" 2>&1 > /dev/null \
-    || true
+        "${py_script}" > /dev/null 2>&1 ||
+        true
     memray table \
         -o "${PROFILING_OUTPUT_DIR}/${table_file}" \
         --force \
diff --git a/bin/profile-examples.sh b/bin/profile-examples.sh
@@ -7,15 +7,16 @@
 set -e -u -o pipefail
 
 echo "profiling examples"
+# shellcheck disable=SC2044
 for py_script in $(find "${LIGHTGBM_HOME}/examples/python-guide" -name '*.py'); do
     base_filename=$(basename "${py_script}")
-    prof_file=$(echo "${base_filename}" | sed -e 's/\.py/\.prof/g')
+    prof_file="${base_filename/.py/.prof}"
     echo "  - ${base_filename}"
     python \
         -Wignore \
         -m cProfile \
         -o "${PROFILING_OUTPUT_DIR}/${prof_file}" \
-        "${py_script}" 2>&1 > /dev/null \
-    || true
+        "${py_script}" > /dev/null 2>&1 ||
+        true
 done
 echo "Done profiling examples. See '${PROFILING_OUTPUT_DIR}' for results."
diff --git a/jupyter_notebook_config.py b/jupyter_notebook_config.py
@@ -1,3 +1,4 @@
+# mypy: disable-error-code="name-defined"
 c.ServerApp.token = ""
 c.ServerApp.password = ""
 c.ServerApp.open_browser = False
diff --git a/notebooks/_img/dask-horizontal.svg b/notebooks/_img/dask-horizontal.svg
@@ -1,7 +1,7 @@
-<svg id="Layer_1" 
-     data-name="Layer 1" 
-     xmlns="http://www.w3.org/2000/svg" 
-     xmlns:xlink="http://www.w3.org/1999/xlink" 
+<svg id="Layer_1"
+     data-name="Layer 1"
+     xmlns="http://www.w3.org/2000/svg"
+     xmlns:xlink="http://www.w3.org/1999/xlink"
      viewBox="0 0 550 247.95">
   <defs>
     <linearGradient id="linear-gradient" x1="154.55" y1="173.33" x2="242.36" y2="173.33" gradientTransform="translate(-26.62 -73.73) rotate(7.91)" gradientUnits="userSpaceOnUse">
diff --git a/notebooks/demo-aws.ipynb b/notebooks/demo-aws.ipynb
@@ -51,7 +51,9 @@
     "with open(\"../ecr-details.json\", \"r\") as f:\n",
     "    ecr_details = json.loads(f.read())\n",
     "\n",
-    "CONTAINER_IMAGE = ecr_details[\"repository\"][\"repositoryUri\"] + \":\" + os.environ[\"DASK_VERSION\"]\n",
+    "CONTAINER_IMAGE = (\n",
+    "    ecr_details[\"repository\"][\"repositoryUri\"] + \":\" + os.environ[\"DASK_VERSION\"]\n",
+    ")\n",
     "print(f\"scheduler and worker image: {CONTAINER_IMAGE}\")"
    ]
   },
diff --git a/notebooks/testing/ranker-local.ipynb b/notebooks/testing/ranker-local.ipynb
@@ -57,9 +57,12 @@
     "    avg_gs=10,\n",
     "    random_state=0,\n",
     "):\n",
-    "    \"\"\"Generate a learning-to-rank dataset - feature vectors grouped together with\n",
-    "    integer-valued graded relevance scores. Replace this with a sklearn.datasets function\n",
-    "    if ranking objective becomes supported in sklearn.datasets module.\"\"\"\n",
+    "    \"\"\"\n",
+    "    Generate a learning-to-rank dataset - feature vectors grouped\n",
+    "    together with integer-valued graded relevance scores. Replace this\n",
+    "    with a sklearn.datasets function if ranking objective becomes\n",
+    "    supported in sklearn.datasets module.\n",
+    "    \"\"\"\n",
     "    rnd_generator = check_random_state(random_state)\n",
     "\n",
     "    y_vec, group_vec = np.empty((0,), dtype=int), np.empty((0,), dtype=int)\n",
@@ -84,7 +87,8 @@
     "    x_grid = np.linspace(0, stop=1, num=gmax + 2)\n",
     "    X = rnd_generator.uniform(size=(n_samples, n_features))\n",
     "\n",
-    "    # make first n_informative features values bucketed according to relevance scores.\n",
+    "    # make first n_informative features values\n",
+    "    # bucketed according to relevance scores.\n",
     "    def bucket_fn(z):\n",
     "        return rnd_generator.uniform(x_grid[z], high=x_grid[z + 1])\n",
     "\n",
@@ -102,12 +106,14 @@
     "    g_rle = np.array([sum([1 for _ in grp]) for _, grp in itertools.groupby(g)])\n",
     "\n",
     "    if output == \"dataframe\":\n",
-    "        # add target, weight, and group to DataFrame so that partitions abide by group boundaries.\n",
+    "        # add target, weight, and group to DataFrame so that\n",
+    "        # partitions abide by group boundaries.\n",
     "        X_df = pd.DataFrame(X, columns=[f\"feature_{i}\" for i in range(X.shape[1])])\n",
     "        X = X_df.copy()\n",
     "        X_df = X_df.assign(y=y, g=g, w=w)\n",
     "\n",
-    "        # set_index ensures partitions are based on group id. See https://bit.ly/3pAWyNw.\n",
+    "        # set_index ensures partitions are based on group id.\n",
+    "        # See https://bit.ly/3pAWyNw.\n",
     "        X_df.set_index(\"g\", inplace=True)\n",
     "        dX = dd.from_pandas(X_df, chunksize=chunk_size)\n",
     "\n",
@@ -117,12 +123,16 @@
     "        dX = dX.drop(columns=[\"y\", \"w\"])\n",
     "        dg = dX.index.to_series()\n",
     "\n",
-    "        # encode group identifiers into run-length encoding, the format LightGBMRanker is expecting\n",
+    "        # encode group identifiers into run-length encoding,\n",
+    "        # the format LightGBMRanker is expecting\n",
     "        # so that within each partition, sum(g) = n_samples.\n",
-    "        dg = dg.map_partitions(lambda p: p.groupby(\"g\", sort=False).apply(lambda z: z.shape[0]))\n",
+    "        dg = dg.map_partitions(\n",
+    "            lambda p: p.groupby(\"g\", sort=False).apply(lambda z: z.shape[0])\n",
+    "        )\n",
     "\n",
     "    elif output == \"array\":\n",
-    "        # ranking arrays: one chunk per group. Each chunk must include all columns.\n",
+    "        # ranking arrays: one chunk per group.\n",
+    "        # Each chunk must include all columns.\n",
     "        p = X.shape[1]\n",
     "        dX, dy, dw, dg = list(), list(), list(), list()\n",
     "        for g_idx, rhs in enumerate(np.cumsum(g_rle)):\n",
@@ -138,7 +148,9 @@
     "        dg = da.concatenate(dg, axis=0)\n",
     "\n",
     "    else:\n",
-    "        raise ValueError(\"ranking data creation only supported for Dask arrays and dataframes\")\n",
+    "        raise ValueError(\n",
+    "            \"ranking data creation only supported for Dask arrays and dataframes\"\n",
+    "        )\n",
     "\n",
     "    return X, y, w, g_rle, dX, dy, dw, dg"
    ]
@@ -219,7 +231,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# relative difference between distributed ranker and local ranker spearman corr should be small.\n",
+    "# relative difference between distributed ranker\n",
+    "# and local ranker spearman corr should be small.\n",
     "lcor = spearmanr(rnkvec_local, y).correlation\n",
     "print(np.abs(dcor - lcor))\n",
     "assert np.abs(dcor - lcor) < 0.003"
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,12 +1,33 @@
-[tool.black]
-line-length = 100
-exclude = '''
-/(
-  | LightGBM
-)/
-'''
+[tool.ruff.lint]
+select = [
+    # flake8-bugbear
+    "B",
+    # flake8-comprehensions
+    "C4",
+    # pycodestyle
+    "E",
+    # pyflakes
+    "F",
+    # NumPy-specific rules
+    "NPY",
+    # pylint
+    "PL",
+    # flake8-return: unnecessary assignment before return
+    "RET504",
+    # flake8-simplify: use dict.get() instead of an if-else block
+    "SIM401",
+]
 
-[tool.nbqa.exclude]
-black = "LightGBM/"
-flake8 = "LightGBM/"
-isort = "LightGBM/"
+[tool.ruff.lint.per-file-ignores]
+"*.ipynb" = [
+  # (flake8-comprehensions) Unnecessary list() call
+  "C408",
+  # (pylint) too many arguments in function definition
+  "PLR0913",
+  # (pylint) Magic value used in comparison
+  "PLR2004",
+]
+"jupyter_notebook_config.py" = [
+    # (pyflakes) undefined name
+    "F821",
+]
diff --git a/setup.cfg b/setup.cfg

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+# mypy: disable-error-code="name-defined"`
`1`	`2`	`c.ServerApp.token = ""`
`2`	`3`	`c.ServerApp.password = ""`
`3`	`4`	`c.ServerApp.open_browser = False`