Skip to content

Meta/pymc #21

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 49 additions & 38 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,95 +1,106 @@
#!/usr/bin/make -f
SHELL = /bin/sh
.DELETE_ON_ERROR:

# boilerplate variables, do not edit
MAKEFILE_PATH := $(abspath $(firstword $(MAKEFILE_LIST)))
MAKEFILE_DIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))

# required values, set to defaults here if not given in config.mk
PACKAGE_DIR ?= src
TEST_DIR ?= tests
STUB_DIR ?= stubs
LINTING_LINELENGTH ?= 120
PYTHON ?= python
CODECOV_TOKEN ?= ${CODECOV_TOKEN}

CURRENT_SIGN_SETTING := $(shell git config commit.gpgSign)
PACKAGE_DIRECTORY=bayesian_hmm
TEST_DIRECTORY=tests
LINTING_LINELENGTH=120
STUB_DIRECTORY=stubs

.PHONY: help clean-pyc clean-build isort-test isort darglint-test black-test black stubs mypy-test pytest test format tox

help:
@echo " help: Print this help"
@echo " clean-pyc: Remove python artifacts."
@echo " clean-build: Remove build artifacts."
@echo " isort-test: Test whether import statements are sorted."
@echo " isort: Sort import statements."
@echo " darglint-test: Test whether docstrings are valid."
@echo " black-test: Test whether black formatting is adhered to."
@echo " black: Apply black formatting."
@echo " stubs: Run stub generation."
@echo " mypy-test: Test whether mypy type annotations are sufficient."
@echo " pytest: Run pytest suite."
@echo " test: Run all tests."
@echo " format: Apply all formatting tools."
@echo " profile: Run cProfile on the MCMC example and print longest running functions."
@echo " tox: Run tox testing."
@echo " release: Build and upload to PyPI."

# use regex to go through this Makefile and print help lines
# help lines is any comment starting with double '#' (see next comment). Prints alphabetical order.
## help : print this help.
.PHONY: help
help: Makefile
@echo "\nBayesian Hidden Markov Models."
@echo "\n Generic commands"
@sed -n 's/^## / /p' $< | sort

## clean: Remove python artifacts.
.PHONY: clean-pyc clean-build clean
clean-pyc:
find . -regex '^./\($(PACKAGE_DIRECTORY)\|$(TEST_DIRECTORY)\)/.*\.py[co]' -delete
find . -regex '^./\($(PACKAGE_DIRECTORY)\|$(TEST_DIRECTORY)\)/.*__pycache__' -delete
find . -regex '^./\($(PACKAGE_DIR)\|$(TEST_DIR)\)/.*\.py[co]' -delete
find . -regex '^./\($(PACKAGE_DIR)\|$(TEST_DIR)\)/.*__pycache__' -delete

clean-build:
rm --force --recursive build/
rm --force --recursive dist/
rm --force --recursive *.egg-info

clean: clean-pyc clean-build

isort-test: clean-pyc
isort \
--recursive \
--check-only \
--line-width $(LINTING_LINELENGTH) \
--multi-line 3 \
--trailing-comma \
$(PACKAGE_DIRECTORY) $(TEST_DIRECTORY)
$(PACKAGE_DIR) $(TEST_DIR)

isort: clean-pyc
isort \
--recursive \
--line-width $(LINTING_LINELENGTH) \
--multi-line 3 \
--trailing-comma \
$(PACKAGE_DIRECTORY) $(TEST_DIRECTORY)
$(PACKAGE_DIR) $(TEST_DIR)

darglint-test:
darglint --docstring-style google --strictness full $(PACKAGE_DIRECTORY) $(TEST_DIRECTORY)
darglint --docstring-style google --strictness full $(PACKAGE_DIR) $(TEST_DIR)

black-test:
black \
--check \
--include "^/($(PACKAGE_DIRECTORY)/|$(TEST_DIRECTORY)/).*\.pyi?" \
--include "^/($(PACKAGE_DIR)/|$(TEST_DIR)/).*\.pyi?" \
--line-length $(LINTING_LINELENGTH) \
.

black:
black \
--include "^/($(PACKAGE_DIRECTORY)/|$(TEST_DIRECTORY)/).*\.pyi?" \
--include "^/($(PACKAGE_DIR)/|$(TEST_DIR)/).*\.pyi?" \
--line-length $(LINTING_LINELENGTH) \
.

## stubs: Run stub generation.
stubs: clean-pyc
stubgen -o $(STUB_DIRECTORY) $(PACKAGE_DIRECTORY)
stubgen -o $(STUB_DIR) $(PACKAGE_DIR)

mypy-test:
mypy $(PACKAGE_DIRECTORY)
mypy $(PACKAGE_DIR)

pytest:
python -m pytest \
--verbose \
--color=yes \
--cov=$(PACKAGE_DIRECTORY) \
--cov-report term-missing
--cov=$(PACKAGE_DIR) \
--cov-report term-missing \
--cov-report xml

test: clean-pyc isort-test darglint-test black-test mypy-test pytest
## test: Run all tests.
test: clean-pyc isort-test darglint-test black-test pytest

## format: Apply all formatting tools.
format: clean-pyc isort black stubs

## profile: Run cProfile on the MCMC example and print longest running functions.
profile:
python -m cProfile -o .profile.txt "$(TEST_DIRECTORY)/example.py"
python -m cProfile -o .profile.txt "$(TEST_DIR)/example.py"
python -c "import pstats; s = pstats.Stats('.profile.txt'); print(s.strip_dirs().sort_stats('cumtime').print_stats(20))"

## tox: Run tox testing.
tox:
tox

## release: Build and upload to PyPI.
release: clean-pyc clean-build test
git config commit.gpgSign true
bumpversion $(bump)
Expand Down
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,14 @@ including efficient beam sampling for the latent sequence resampling steps,
and multithreading when possible for parameter resampling.


## Warning

We now recommend using `PyMC` for Bayesian hidden Markov models. You can find some experimental
scripts in the `experiments` directory for doing this. Because `PyMC` is better maintained, you
can expect better support, and more flexibility in model specification. It may lack some
idiosyncratic sampling optimizations, but is probably a better choice for most users.


## Installation

The current version is development only, and installation is only recommended for
Expand Down
69 changes: 69 additions & 0 deletions experiments/truncated_dirichlet-pymc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/usr/bin/env python
"""Use PyMC 4.0 to demonstrate a truncated Dirichlet Process implementation.

This script has leans heavily on this fantastic tutorial: https://www.pymc.io/projects/examples/en/latest/mixture_models/dp_mix.html
Refer there for any questions.

PyMC is a well-developed and managed package, and we strongly suggest that users consider using this. For most practical applications,
a truncated Dirichlet Procees BHMM is almost surely sufficient, and PyMC 4 contains numerous optimizations for sampling etc. that
make it a very good choice.

This script could be made more efficient in PyMC4: you can use StickBreakingWeights directly, for example. However, it was not clear
how to resolve categorical priors; see here: https://discourse.pymc.io/t/using-a-categorical-prior-for-a-mixture-model/5394/6
"""

# weird import with numpy is required for theano tensor in pymc 4.0
import pymc as pm
import numpy
import numpy.distutils
numpy.distutils.__config__.blas_opt_info = numpy.distutils.__config__.blas_ilp64_opt_info
import theano.tensor as tt


# create some fake data with integers in range(N)
chains = [
[0, 1, 2, 3],
[0, 1, 2, 3, 0, 1, 2, 2, 0, 1, 3, 0]
]

# Define meta-parameters of BHMM
K = 8 # number of latent states
N = len(set(s for chain in chains for s in chain)) # number of observed states

with pm.Model(coords={"states": range(K), "observations": range(N), "obs_id": range(N)}) as model:
alpha = pm.Gamma("alpha", 2.0, 2.0)
gamma = pm.Gamma("gamma", 3.0, 3.0)
beta_emission = pm.Gamma("beta_emission", 1.0, 1.0)

# transition matrix: each row is a Dirichlet rv
w_transition = pm.StickBreakingWeights("beta_transition", alpha=alpha, K=K, dims="states") # only in PyMC 4
pi = [pm.Dirichlet(f"pi_{state}", gamma * w_transition, dims="states") for state in range(K)]

# emission matrix: another Dirichlet distribution
emissions = [pm.Dirichlet(f"emission_{state}", [beta_emission for _ in range(N)], dims="observations") for state in range(K)]

# now, create latent state chain
# states = [pm.Categorical("s0", p = [1 / N for _ in range(N)])]
# for t in range(1, T):
# # get transitions from expanding out, since PyMC cannot index by random variables
# # prob = pm.Deterministic(f"prob_{t}", [row * state for row, state in zip(pi, states[t-1])])
# # states.append(pm.Categorical(f"s{t}", p=prob))
# # states.append(pm.Categorical(f"s{t}", p=pi[states[t-1]]))
# states.append(pm.Categorical(f"s{t}", p=pi[states[t-1].eval()]))

# now, tie observations to latent states
# emissions = [pm.Categorical(f"e{t}", p=emissions[states[t].eval()], observed=chain) for t in range(T)]

# now, create latent state chain
chain_states = [None for _ in chains]
chain_emissions = [None for _ in chains]
for i, chain in enumerate(chains):
chain_states[i] = [pm.Categorical(f"s_{i}_0", p = [1 / N for _ in range(N)])]
for t in range(1, len(chain)):
chain_states[i].append(pm.Categorical(f"s_{i}_{t}", p=pi[chain_states[i][t-1]]))

# now, tie observations to latent states
chain_emissions[i] = [pm.Categorical(f"e_{i}_{t}", p=emissions[chain_states[i][t]], observed=chain[t]) for t in range(len(chain))]

# time for fitting
trace = pm.sample(500, tune=100, cores=1)
62 changes: 62 additions & 0 deletions experiments/truncated_dirichlet-pymc3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/usr/bin/env python
"""Use PyMC3 to demonstrate a truncated Dirichlet Process implementation.

This script has leans heavily on this fantastic tutorial: https://www.pymc.io/projects/examples/en/latest/mixture_models/dp_mix.html
Refer there for any questions.

PyMC is a well-developed and managed package, and we strongly suggest that users consider using this. For most practical applications,
a truncated Dirichlet Procees BHMM is almost surely sufficient, and PyMC 4 contains numerous optimizations for sampling etc. that
make it a very good choice.

This script could be made more efficient in PyMC4: you can use StickBreakingWeights directly, for example. However, it was not clear
how to resolve categorical priors; see here: https://discourse.pymc.io/t/using-a-categorical-prior-for-a-mixture-model/5394/6
"""

import pymc3 as pm
import theano.tensor as tt


def stick_breaking(beta):
portion_remaining = tt.concatenate([[1], tt.extra_ops.cumprod(1 - beta)[:-1]])
return beta * portion_remaining

# create some fake data (real data may need to be made equal length, or use a more sophisticated model specification)
chains = [
[0, 1, 2, 3],
[0, 1, 2, 3, 0, 1, 2, 2, 0, 1, 3, 0]
]

# Define meta-parameters of BHMM
K = 6 # number of latent states
N = len(set(s for chain in chains for s in chain)) # number of observed states

model = pm.Model(coords={"states": range(K), "observations": range(N), "obs_id": range(N)})

with model:
alpha = pm.Gamma("alpha", 1.0, 1.0)
gamma = pm.Gamma("gamma", 1.0, 1.0)

# transition matrix: each row is a Dirichlet rv
beta_transition = pm.Beta("beta_transition", 3.0, alpha, dims="states")
w_transition = pm.Deterministic("w_transition", stick_breaking(beta_transition), dims="states")
pi = [pm.Dirichlet(f"pi_{state}", gamma * w_transition, dims="states") for state in range(K)]
# pi = tt.real(pi)

# emission matrix: another Dirichlet distribution
beta_emission = pm.Gamma("beta_emission", 1.0, 1.0)
emissions = [pm.Dirichlet(f"emission_{state}", [beta_emission for _ in range(N)], dims="observations") for state in range(K)]
# emissions = tt.real([tt.real(e) for e in emissions])

# now, create latent state chain
chain_states = [None for _ in chains]
chain_emissions = [None for _ in chains]
for i, chain in enumerate(chains):
chain_states[i] = [pm.Categorical(f"s_{i}_0", p = [1 / N for _ in range(N)])]
for t in range(1, len(chain)):
chain_states[i].append(pm.Categorical(f"s_{i}_{t}", p=tt.real(pi)[chain_states[i][t-1]]))
# now, tie observations to latent states
chain_emissions[i] = [pm.Categorical(f"e_{i}_{t}", p=tt.real(emissions)[chain_states[i][t]], observed=chain[t]) for t in range(len(chain))]

# time for fitting
trace = pm.sample(500, tune=100)

71 changes: 27 additions & 44 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,44 +1,27 @@
atomicwrites==1.3.0
attrs==19.1.0
bleach==3.1.0
certifi==2019.9.11
chardet==3.0.4
coverage==4.5.4
darglint==1.1.0
docutils==0.15.2
filelock==3.0.12
idna==2.8
importlib-metadata==0.20
isort==4.3.21
more-itertools==7.2.0
mpmath==1.1.0
mypy==0.720
mypy-extensions==0.4.1
numpy==1.18.1
packaging==19.1
pkginfo==1.5.0.1
pluggy==0.13.1
py==1.8.0
Pygments==2.4.2
pyparsing==2.4.2
pytest==5.1.2
pytest-cov==2.7.1
readme-renderer==24.0
requests==2.22.0
requests-toolbelt==0.9.1
scipy==1.4.1
six==1.12.0
sympy==1.4
terminaltables==3.1.0
toml==0.10.0
tox==3.14.3
tox-travis==0.12
tqdm==4.35.0
twine==1.14.0
typed-ast==1.4.0
typing-extensions==3.7.4
urllib3==1.25.3
virtualenv==16.7.9
wcwidth==0.1.7
webencodings==0.5.1
zipp==0.6.0
attrs==22.1.0
coverage==6.4.4
darglint==1.8.1
distlib==0.3.5
filelock==3.8.0
iniconfig==1.1.1
mpmath==1.2.1
mypy==0.971
mypy-extensions==0.4.3
numpy==1.23.2
packaging==21.3
platformdirs==2.5.2
pluggy==1.0.0
py==1.11.0
pyparsing==3.0.9
pytest==7.1.2
pytest-cov==3.0.0
scipy==1.9.0
six==1.16.0
sympy==1.10.1
terminaltables==3.1.10
toml==0.10.2
tomli==2.0.1
tox==3.25.1
tqdm==4.64.0
typing_extensions==4.3.0
virtualenv==20.16.3
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
name="bayesian_hmm",
version="0.1.0",
license="MIT",
packages=setuptools.find_packages(exclude=["tests"]),
package_dir={"": "src"},
packages=setuptools.find_packages(where="src"),
author="James Ross",
author_email="[email protected]",
description="A non-parametric Bayesian approach to hidden Markov models",
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,10 @@ def single_variable_resample(scale: float, count: int, exact: bool) -> int:
An auxiliary variable, sampled with conditional distribution.

Raises:
err: If an error is raised while computing the approximate log likelihood that is not recognised, it is
RecursionError: If an error is raised while computing the approximate log likelihood that is not recognised, it is
passed on to the user.
OverflowError: If an error is raised while computing the approximate log likelihood that is not recognised, it is
passed on to the user.

"""
# TODO: check if we can simplify this
if count <= 0:
Expand Down
File renamed without changes.
10 changes: 3 additions & 7 deletions stubs/bayesian_hmm/__init__.pyi
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
# Stubs for bayesian_hmm (Python 3)
#
# NOTE: This dynamically typed stub was automatically generated by stubgen.

from .chain import Chain
from .hdphmm import HDPHMM
from .variables import AggregateState, DirichletDistributionFamily, DirichletProcessFamily, HierarchicalDirichletProcess, MissingState, StartingState, State, StickBreakingProcess, Variable, hyperparameter
from .chain import Chain as Chain
from .hdphmm import HDPHMM as HDPHMM
from .variables import AggregateState as AggregateState, DirichletDistributionFamily as DirichletDistributionFamily, DirichletProcessFamily as DirichletProcessFamily, HierarchicalDirichletProcess as HierarchicalDirichletProcess, MissingState as MissingState, StartingState as StartingState, State as State, StickBreakingProcess as StickBreakingProcess, Variable as Variable, hyperparameter as hyperparameter
Loading