diff --git a/README.md b/README.md index ace12fb..9e34d7c 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,39 @@ https://dbc-dp-xxxx.cloud.databricks.com/driver-proxy/o/xxxx/xx-xxx-xxxx/8087/st ![](https://user-images.githubusercontent.com/1610850/281441285-9b84d5f1-d58a-45dc-9354-7385e1599d1f.png) +### Troubleshooting with cluster logs + +If you're experiencing problems starting your Dask Databricks cluster then viewing logs for your init scripts can help narrow down the problem. + +When you create your cluster we recommend that you [configure your logs](https://docs.databricks.com/en/clusters/configure.html#cluster-log-delivery) to write to somewhere like `dbfs:/cluster_init_logs`. + +To make viewing these logs a little easier we've included a couple of CLI utilities in `dask-databricks` to help you navigate them. + +#### Listing clusters + +You can get a full list of available logs with the `dask databricks logs ls <path>` command where the path is the DBFS location you configured your logs to write to. + +```console +$ dask databricks logs ls dbfs:/cluster_init_logs + + Cluster Start time Node Count Node IPs + ────────────────────────────────────────────────────────────────────────────────────── + 1234-987654-a1b2c3d4 Nov 16 2023 10:36 2 10.0.0.1, 10.0.0.2 +``` + +#### Viewing logs + +Once you have your cluster ID you can view the logs from the latest launch of that cluster with `dask databricks logs cat <path> <cluster-id>`. + +```console +$ dask databricks logs cat dbfs:/cluster_init_logs 1234-987654-a1b2c3d4 +Cluster: 1234-987654-a1b2c3d4 +Start time: Nov 16 2023 10:36 +10.0.0.1: Start Python bootstrap +10.0.0.1: PYSPARK_PYTHON is /databricks/python3/bin/python +... +``` + ## Releasing Releases of this project are automated using [GitHub Actions and the `pypa/gh-action-pypi-publish` action](https://github.com/jacobtomlinson/dask-databricks/blob/main/.github/workflows/release.yaml). 
diff --git a/dask_databricks/cli.py b/dask_databricks/cli.py index 91d317a..0a18932 100644 --- a/dask_databricks/cli.py +++ b/dask_databricks/cli.py @@ -1,13 +1,31 @@ import json import logging import os +import random import socket import subprocess import sys import time +from datetime import datetime import click +from rich import box +from rich.color import ANSI_COLOR_NAMES +from rich.console import Console from rich.logging import RichHandler +from rich.table import Table + +console = Console() + +NODE_COLOURS = ["medium_spring_green", "light_steel_blue1", "wheat1", "medium_orchid"] + +# Generate list of random colours from rich +# import random +# from rich.color import Color +# +# for i in range(100): +# colour = Color.random() +# print(f'"{colour.name}",', end=" def get_logger(): @@ -96,5 +114,121 @@ def run(worker_command, worker_args, cuda): sys.exit(1) +@main.group() +def logs(): + """View cluster init logs.""" + + +def _get_logs_at_path(path): + try: + from databricks.sdk.runtime import dbutils + except ImportError: + raise RuntimeError("Please install databricks-sdk.") + clusters = {} + + for cluster in dbutils.fs.ls(path): + cluster_id = cluster.path.split("/")[-1] + clusters[cluster_id] = {} + for node in dbutils.fs.ls(cluster.path + "/init_scripts"): + for log in dbutils.fs.ls(node.path): + filename = log.path.split("/")[-1] + channel = filename.split(".")[-2] + datetime = "_".join(filename.split("_")[:2]) + node_name = log.path.split("/")[-2].split("_", 1)[-1].replace("_", ".") + if datetime not in clusters[cluster_id]: + clusters[cluster_id][datetime] = {} + + if node_name not in clusters[cluster_id][datetime]: + clusters[cluster_id][datetime][node_name] = {} + + clusters[cluster_id][datetime][node_name][channel] = log.path + return clusters + + +def _get_node_color(i): + if i < len(NODE_COLOURS): + return NODE_COLOURS[i] + else: + return random.choice(list(ANSI_COLOR_NAMES)) + + +def _prettify_launch_time(launch_time): + return 
datetime.strptime(launch_time, "%Y%m%d_%H%M%S").strftime("%b %d %Y %H:%M") + + +@logs.command() +@click.argument("path") +@click.option("--show-filenames", help="Show filenames in the output", is_flag=True, default=False, show_default=True) +def ls(path, show_filenames): + # TODO add flag to list filenames + table = Table(box=box.SIMPLE_HEAD) + table.add_column("Cluster", style="cyan", no_wrap=True) + table.add_column("Start time", style="plum2") + table.add_column("Node Count") + table.add_column("Node IPs") + if show_filenames: + table.add_column("Filenames") + with console.status("[bright_black]Finding logs..."): + clusters = _get_logs_at_path(path) + for cluster in clusters: + first = True + for launch_time in sorted(clusters[cluster], reverse=True): + pretty_launch_time = _prettify_launch_time(launch_time) + cluster_name = cluster if first else "" + node_list = ", ".join( + f"[{_get_node_color(i)}]{name}[/{_get_node_color(i)}]" + for i, name in enumerate(clusters[cluster][launch_time]) + ) + data = [cluster_name, pretty_launch_time, str(len(clusters[cluster][launch_time])), node_list] + if show_filenames: + filenames = "" + for i, node in enumerate(clusters[cluster][launch_time]): + for channel in ["stdout", "stderr"]: + node_colour = _get_node_color(i) + filenames += f"[{node_colour}]{clusters[cluster][launch_time][node][channel]}[/{node_colour}]\n" + data.append(filenames) + table.add_row(*data) + first = False + + console.print(table) + + +@logs.command() +@click.argument("path") +@click.argument("cluster") +def cat(path, cluster): + # TODO add a flag for selecting which start time to view + # TODO add a flag to filter which nodes to view logs for + try: + from databricks.sdk.runtime import dbutils + except ImportError: + raise RuntimeError("Please install databricks-sdk.") + + with console.status("[bright_black]Finding logs..."): + clusters = _get_logs_at_path(path) + + if cluster not in clusters: + console.print(f"Cluster {cluster} not found.", 
style="bold red", highlight=False) + console.print( + f"Hint: Try running dask [b i]databricks logs ls {path}[/b i] to list clusters.", + style="bright_black", + highlight=False, + ) + sys.exit(1) + + most_recent = sorted(clusters[cluster].keys())[-1] + + console.print(f"Cluster: {cluster}", style="bold cyan", highlight=False) + console.print(f"Start time: {_prettify_launch_time(most_recent)}", style="bold cyan", highlight=False) + + for i, node in enumerate(clusters[cluster][most_recent]): + for channel in ["stdout", "stderr"]: + for line in dbutils.fs.head(clusters[cluster][most_recent][node][channel], 65536).split("\n"): + node_colour = _get_node_color(i) + console.print( + f"[{node_colour}]{node}[/{node_colour}]: {line}", style="grey89" if channel == "stdout" else "plum4" + ) + + if __name__ == "__main__": main()