
Commit 677ff40

Drop Python 3.8 support. (#1574)
* Drop Python 3.8 support.
* Formatting
1 parent 9b33995 · commit 677ff40

20 files changed (+74, -72 lines)

.github/workflows/python-package.yml (+1, -1)

@@ -111,7 +111,7 @@ jobs:
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
         # The specific Python version is irrelevant in this context as we are only packaging non-C extension
-        # code. This ensures compatibility across Python versions, including Python 3.8, as compatibility is
+        # code. This ensures compatibility across Python versions, including Python 3.9, as compatibility is
         # dictated by the packaged code itself, not the Python version used for packaging.
         python-version: ["3.10"]
         arch: [x86_64, aarch64]
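The workflow comment captures the reasoning: the wheels contain no code compiled against a specific interpreter, so the supported floor is enforced by package metadata rather than by the CI matrix. As a purely illustrative sketch (the project's packaging files are not part of this excerpt, and all names below are placeholders), a bump like this one is typically mirrored in the install requirements:

# Hypothetical setup.py fragment; the real project may declare this
# in pyproject.toml instead.
from setuptools import setup

setup(
    name="example-package",
    python_requires=">=3.9",  # raised from ">=3.8" when 3.8 support is dropped
)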

.pre-commit-config.yaml (+1, -1)

@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.6.9
+    rev: v0.11.2
     hooks:
       - id: ruff
         args:

benchmarking/int8/int8_benchmark.py (+1, -1)

@@ -65,4 +65,4 @@
 print("=" * 40)
 print(f"Example:\n{tokenizer.decode(generated_ids[0])}")
 print("=" * 40)
-print(f"Speed: {num/(time.time() - time_1)}token/s")
+print(f"Speed: {num / (time.time() - time_1)}token/s")

benchmarking/matmul_benchmark.py (+10, -8)

@@ -66,7 +66,7 @@ def test_bench_matmul(batch, seq, model, hidden):
         torch.matmul(A, B.t())
     torch.cuda.synchronize()
     print(
-        f"pytorch fp16: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s",
+        f"pytorch fp16: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s",
     )

     # torch.cuda.synchronize()
@@ -88,22 +88,24 @@ def test_bench_matmul(batch, seq, model, hidden):
     for i in range(iters):
         bnb.matmul_4bit(A, B_nf4.t(), quant_state=state_nf4)
     torch.cuda.synchronize()
-    print(f"bnb nf4: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
+    print(f"bnb nf4: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s")

     torch.cuda.synchronize()
     t0 = time.time()
     for i in range(iters):
         bnb.matmul_4bit(A, B_nf4_c.t(), quant_state=state_nf4_c)
     torch.cuda.synchronize()
-    print(f"bnb nf4+DQ: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
+    print(
+        f"bnb nf4+DQ: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
+    )

     torch.cuda.synchronize()
     t0 = time.time()
     for i in range(iters):
         bnb.matmul(A, B)
     torch.cuda.synchronize()
     print(
-        f"B -> CB (each iteration): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
+        f"B -> CB (each iteration): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )

     torch.cuda.synchronize()
@@ -112,7 +114,7 @@ def test_bench_matmul(batch, seq, model, hidden):
         bnb.matmul(A, B, threshold=6.0)
     torch.cuda.synchronize()
     print(
-        f"B -> CB + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
+        f"B -> CB + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )

     CA, SCA, _ = F.int8_vectorwise_quant(A, threshold=0.0)
@@ -124,7 +126,7 @@ def test_bench_matmul(batch, seq, model, hidden):
         out32 = F.int8_linear_matmul(CA, CB)
     torch.cuda.synchronize()
     print(
-        f"no overhead int8 [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
+        f"no overhead int8 [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )

     # C32A, SA = F.transform(CA, "col32")
@@ -183,7 +185,7 @@ def test_bench_matmul(batch, seq, model, hidden):
         linear8bit(A)
     torch.cuda.synchronize()
     print(
-        f"bnb linear8bitlt (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
+        f"bnb linear8bitlt (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )

     linearMixedBit(A)
@@ -193,7 +195,7 @@ def test_bench_matmul(batch, seq, model, hidden):
         linearMixedBit(A)
     torch.cuda.synchronize()
     print(
-        f"bnb linear8bitlt with threshold (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
+        f"bnb linear8bitlt with threshold (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
     )

     # linear8bit_train(A)
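Every hunk in this file wraps the same timing idiom: synchronize, take a timestamp, launch the kernel in a loop, synchronize again, report. A self-contained sketch of that pattern (placeholder shapes; assumes a CUDA device is available):

import time

import torch

def bench(fn, iters: int = 100) -> float:
    """Wall-clock seconds for `iters` launches, with the syncs CUDA timing needs."""
    torch.cuda.synchronize()  # drain previously queued work before the clock starts
    t0 = time.time()
    for _ in range(iters):
        fn()
    torch.cuda.synchronize()  # wait for the timed kernels to actually finish
    return time.time() - t0

A = torch.randn(1, 512, 1024, device="cuda", dtype=torch.float16)
B = torch.randn(4096, 1024, device="cuda", dtype=torch.float16)
print(f"pytorch fp16: {bench(lambda: torch.matmul(A, B.t())):.4f}s")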

bitsandbytes/_ops.py (+5, -4)

@@ -1,5 +1,6 @@
+from collections.abc import Sequence
 from math import prod
-from typing import Optional, Sequence, Tuple
+from typing import Optional

 import torch
@@ -131,7 +132,7 @@ def _(
 def _(
     A: torch.Tensor,
     threshold=0.0,
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
     out_row = torch.empty_like(A, dtype=torch.int8)
     out_col = torch.empty_like(A, dtype=torch.int8)
     row_stats = torch.empty(prod(A.shape[:-1]), device=A.device, dtype=torch.float32)
@@ -191,7 +192,7 @@ def _(
 @register_fake("bitsandbytes::quantize_4bit")
 def _(
     A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
-) -> Tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor]:
     torch._check_is_size(blocksize)

     n = A.numel()
@@ -235,7 +236,7 @@ def _(


 @register_fake("bitsandbytes::quantize_blockwise")
-def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> Tuple[torch.Tensor, torch.Tensor]:
+def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
     torch._check_is_size(blocksize)
     n = A.numel()
     blocks = -(n // -blocksize)
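This file shows the import pattern repeated throughout the commit: from Python 3.9 on (PEP 585), the builtin `tuple`, `list`, and `dict` are subscriptable, so `typing.Tuple` and friends become redundant, and abstract container types such as `Sequence` are imported from `collections.abc`. A before/after sketch with a hypothetical helper (not from the codebase), reusing the ceil-division trick from the hunk above:

from collections.abc import Sequence

# Python 3.8 style:
#   from typing import Sequence, Tuple
#   def block_counts(shape: Sequence[int], blocksize: int) -> Tuple[int, int]: ...
def block_counts(shape: Sequence[int], blocksize: int) -> tuple[int, int]:
    """Return (elements, blocks); -(n // -blocksize) is ceil division."""
    n = 1
    for dim in shape:
        n *= dim
    return n, -(n // -blocksize)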

bitsandbytes/autograd/_functions.py (+2, -2)

@@ -1,6 +1,6 @@
 from dataclasses import dataclass
 from math import prod
-from typing import Callable, Optional, Tuple
+from typing import Callable, Optional
 import warnings
 from warnings import warn

@@ -55,7 +55,7 @@ def get_current_outlier_idx(self):
 )
 def get_inverse_transform_indices(
     transform_tile: Callable[[torch.Tensor], torch.Tensor],
-    tile_size: Tuple[int, int],
+    tile_size: tuple[int, int],
 ):
     """
     Compute a permutation of indices that invert the specified (tiled) matrix transformation

bitsandbytes/backends/cpu/ops.py (+3, -3)

@@ -1,5 +1,5 @@
 import ctypes as ct
-from typing import Optional, Tuple
+from typing import Optional

 import torch
@@ -47,7 +47,7 @@ def _(


 @register_kernel("bitsandbytes::quantize_blockwise", "cpu")
-def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> Tuple[torch.Tensor, torch.Tensor]:
+def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
     torch._check_is_size(blocksize)
     torch._check(A.dtype == torch.float32, lambda: f"A must be float32 on cpu, got {A.dtype}")

@@ -116,7 +116,7 @@ def _(A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int,
 @register_kernel("bitsandbytes::quantize_4bit", "cpu")
 def _(
     A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
-) -> Tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor]:
     torch._check_is_size(blocksize)
     torch._check(quant_type == "nf4", lambda: f"quant_type must be nf4 on CPU, got {quant_type}")

bitsandbytes/backends/cuda/ops.py (+7, -9)

@@ -1,6 +1,7 @@
+from collections.abc import Sequence
 import ctypes as ct
 from math import prod
-from typing import Optional, Sequence, Tuple
+from typing import Optional

 import torch
@@ -78,10 +79,7 @@ def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor
         raise NotImplementedError("int8_linear_matmul not implemented!")
     else:
         raise RuntimeError(
-            f"cublasLt ran into an error!\n"
-            f"\t{shapeA=}, {shapeB=}, {shapeC=}\n"
-            f"\t{(lda, ldb, ldc)=}\n"
-            f"\t{(m, n, k)=}"
+            f"cublasLt ran into an error!\n\t{shapeA=}, {shapeB=}, {shapeC=}\n\t{(lda, ldb, ldc)=}\n\t{(m, n, k)=}"
         )

     return out
@@ -169,7 +167,7 @@ def _(A: torch.Tensor, threshold=0.0):
 def _(
     A: torch.Tensor,
     threshold=0.0,
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
     # Use CUDA kernel for rowwise and COO tensor
     quant_row, row_stats, outlier_cols = torch.ops.bitsandbytes.int8_vectorwise_quant.default(
         A,
@@ -188,7 +186,7 @@ def _(
 def _get_col_absmax(
     A: torch.Tensor,
     threshold=0.0,
-) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
     torch._check(A.is_floating_point())

     outlier_mask = None
@@ -207,7 +205,7 @@ def _get_col_absmax(


 @register_kernel("bitsandbytes::quantize_blockwise", "cuda")
-def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> Tuple[torch.Tensor, torch.Tensor]:
+def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
     torch._check_is_size(blocksize)
     torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
     torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
@@ -292,7 +290,7 @@ def _dequantize_blockwise_impl(
 @register_kernel("bitsandbytes::quantize_4bit", "cuda")
 def _(
     A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
-) -> Tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor]:
     torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
     torch._check(quant_type in ["fp4", "nf4"])
     torch._check(
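The consolidated error message leans on the f-string `=` specifier (available since Python 3.8), which prints the expression text together with its value; merging the four implicitly concatenated literals into one line is consistent with the ruff bump above. A quick illustration:

m, n, k = 16, 32, 64
# The trailing '=' inside the braces echoes the expression before its repr:
print(f"{(m, n, k)=}")  # prints: (m, n, k)=(16, 32, 64)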

bitsandbytes/cuda_specs.py (+5, -5)

@@ -1,27 +1,27 @@
 import dataclasses
 from functools import lru_cache
-from typing import List, Optional, Tuple
+from typing import Optional

 import torch


 @dataclasses.dataclass(frozen=True)
 class CUDASpecs:
-    highest_compute_capability: Tuple[int, int]
+    highest_compute_capability: tuple[int, int]
     cuda_version_string: str
-    cuda_version_tuple: Tuple[int, int]
+    cuda_version_tuple: tuple[int, int]

     @property
     def has_imma(self) -> bool:
         return torch.version.hip or self.highest_compute_capability >= (7, 5)


-def get_compute_capabilities() -> List[Tuple[int, int]]:
+def get_compute_capabilities() -> list[tuple[int, int]]:
     return sorted(torch.cuda.get_device_capability(torch.cuda.device(i)) for i in range(torch.cuda.device_count()))


 @lru_cache(None)
-def get_cuda_version_tuple() -> Tuple[int, int]:
+def get_cuda_version_tuple() -> tuple[int, int]:
     if torch.version.cuda:
         return map(int, torch.version.cuda.split(".")[0:2])
     elif torch.version.hip:
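These dataclass fields illustrate why the builtin-generic syntax requires the new floor: on Python 3.8, a class-level annotation such as `tuple[int, int]` is evaluated eagerly at class creation and raises `TypeError: 'type' object is not subscriptable`. The pre-3.9 workaround would have been PEP 563's deferred evaluation, sketched here with an illustrative class:

from __future__ import annotations  # PEP 563: annotations stay unevaluated strings

import dataclasses

@dataclasses.dataclass(frozen=True)
class Specs:  # illustrative stand-in for CUDASpecs
    capability: tuple[int, int]  # fails at class creation on 3.8 without the future import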

bitsandbytes/diagnostics/cuda.py (+2, -2)

@@ -1,7 +1,7 @@
+from collections.abc import Iterable, Iterator
 import logging
 import os
 from pathlib import Path
-from typing import Dict, Iterable, Iterator

 import torch

@@ -76,7 +76,7 @@ def is_relevant_candidate_env_var(env_var: str, value: str) -> bool:
 )


-def get_potentially_lib_path_containing_env_vars() -> Dict[str, str]:
+def get_potentially_lib_path_containing_env_vars() -> dict[str, str]:
     return {env_var: value for env_var, value in os.environ.items() if is_relevant_candidate_env_var(env_var, value)}

bitsandbytes/functional.py (+11, -10)

@@ -2,10 +2,11 @@
 #
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
+from collections.abc import Iterable
 import ctypes as ct
 import itertools
 from math import prod
-from typing import Any, Dict, Iterable, Optional, Tuple, Union
+from typing import Any, Optional, Union

 import numpy as np
 import torch
@@ -619,7 +620,7 @@ def __get_item__(self, idx):
         return list_repr[idx]

     @classmethod
-    def from_dict(cls, qs_dict: Dict[str, Any], device: torch.device) -> "QuantState":
+    def from_dict(cls, qs_dict: dict[str, Any], device: torch.device) -> "QuantState":
         """
         unpacks components of state_dict into QuantState
         where necessary, convert into strings, torch.dtype, ints, etc.
@@ -741,7 +742,7 @@ def quantize_blockwise(
     out: Optional[torch.Tensor] = None,
     blocksize=4096,
     nested=False,
-) -> Tuple[torch.Tensor, QuantState]:
+) -> tuple[torch.Tensor, QuantState]:
     """Quantize a tensor in blocks of values.

     The input tensor is quantized by dividing it into blocks of `blocksize` values.
@@ -994,7 +995,7 @@ def quantize_4bit(
     compress_statistics=False,
     quant_type="fp4",
     quant_storage=torch.uint8,
-) -> Tuple[torch.Tensor, QuantState]:
+) -> tuple[torch.Tensor, QuantState]:
     """Quantize tensor A in blocks of 4-bit values.

     Quantizes tensor A by dividing it into blocks which are independently quantized.
@@ -1161,7 +1162,7 @@ def quantize(
     A: Tensor,
     code: Optional[torch.Tensor] = None,
     out: Optional[torch.Tensor] = None,
-) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
+) -> tuple[Tensor, tuple[Tensor, Tensor]]:
     if code is None:
         if "dynamic" not in name2qmap:
             name2qmap["dynamic"] = create_dynamic_map().to(A.device)
@@ -1179,7 +1180,7 @@ def quantize(
 @deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning)
 def dequantize(
     A: Tensor,
-    state: Optional[Tuple[Tensor, Tensor]] = None,
+    state: Optional[tuple[Tensor, Tensor]] = None,
     absmax: Optional[torch.Tensor] = None,
     code: Optional[torch.Tensor] = None,
     out: Optional[torch.Tensor] = None,
@@ -2006,7 +2007,7 @@ def get_colrow_absmax(
     col_stats: Optional[torch.Tensor] = None,
     nnz_block_ptr: Optional[torch.Tensor] = None,
     threshold=0.0,
-) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
     """ "Determine the quantization statistics for input matrix `A` in accordance to the `LLM.int8()` algorithm.

     The row-wise and column-wise absmax values are determined.
@@ -2268,9 +2269,9 @@ def spmm_coo(
     out: Optional[torch.Tensor] = None,
 ):
     if not isinstance(cooA, COOSparseTensor):
-        assert (
-            cooA.is_sparse and cooA.layout == torch.sparse_coo
-        ), "Tensor must be `COOSparseTensor or a PyTorch COO tensor."
+        assert cooA.is_sparse and cooA.layout == torch.sparse_coo, (
+            "Tensor must be `COOSparseTensor or a PyTorch COO tensor."
+        )

         # Convert to custom COOSparseTensor
         cooA = COOSparseTensor(
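Several of the signatures touched here are public API; only the annotations changed, not the behavior. A hedged usage sketch of the blockwise round trip (assumes a CUDA build of bitsandbytes; arguments follow the `quantize_blockwise` signature shown above):

import torch

import bitsandbytes.functional as F

A = torch.randn(4096, device="cuda")
qA, state = F.quantize_blockwise(A, blocksize=4096)  # returns (tensor, QuantState)
A_restored = F.dequantize_blockwise(qA, state)
print(f"max abs error: {(A - A_restored).abs().max().item():.5f}")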

bitsandbytes/nn/modules.py (+2, -2)

@@ -3,7 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 import copy
-from typing import Any, Dict, Optional, TypeVar, Union, overload
+from typing import Any, Optional, TypeVar, Union, overload
 import warnings

 import torch
@@ -268,7 +268,7 @@ def __copy__(self):
     def from_prequantized(
         cls,
         data: torch.Tensor,
-        quantized_stats: Dict[str, Any],
+        quantized_stats: dict[str, Any],
         requires_grad: bool = False,
         device="cuda",
         module: Optional["Linear4bit"] = None,
