"""
Discretization module.
"""
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.optimize import minimize
from scipy.special import logsumexp
from bbmix.models import MixtureBinomial
##
def fit_binom(ad, dp, logratio=False, BIC=True):
"""
Fit a binomial distribution to ad, dp counts using MLE.
"""
p_mle = np.sum(ad) / np.sum(dp)
ad_th = dp * p_mle
d = {}
if logratio:
log_likelihood = np.sum(stats.binom.logpmf(ad, n=dp, p=p_mle))
log_likelihood_saturated = np.sum(np.log([ stats.binom.pmf(k, n, k/n) for n,k in zip(dp,ad) ]))
G2_stat = 2 * (log_likelihood_saturated - log_likelihood)
G2_p_value = stats.chi2.sf(G2_stat, df=dp.size-1)
d['G2_stat'] = G2_stat
d['G2_p_value'] = G2_p_value
elif BIC:
log_likelihood = np.sum(stats.binom.logpmf(ad, n=dp, p=p_mle))
d['BIC'] = 1 * np.log(dp.size) - 2 * log_likelihood
d['L'] = log_likelihood
return ad_th, d
##
def fit_nbinom(ad, dp, logratio=False, BIC=True):
"""
Fit a negative-binomial distribution to ad, dp counts using MLE.
"""
def _negbinom_log_likelihood(params, ad):
"""
Negative log-likelihood function for the negative binomial distribution.
params: list of [r, p], where r is the number of failures and p is the probability of success.
ad: observed allele depths.
dp: observed depths.
"""
r, p = params
if r <= 0 or not (0 < p < 1):
return np.inf
log_likelihood = np.sum(stats.nbinom.logpmf(ad, n=r, p=p))
return -log_likelihood
mean_ad = ad.mean()
var_ad = ad.var()
p_initial = mean_ad / var_ad if var_ad > mean_ad else 0.5
r_initial = mean_ad * (mean_ad / (var_ad - mean_ad)) if var_ad > mean_ad else 1
bounds = [(1e-5, None), (1e-5, 1-1e-5)] # r > 0 and 0 < p < 1
result = minimize(_negbinom_log_likelihood, x0=[r_initial, p_initial], args=(ad), bounds=bounds)
r_mle, p_mle = result.x
np.random.seed(1234)
ad_th = stats.nbinom.rvs(r_mle, p_mle, size=dp.size)
d = {}
if logratio:
log_likelihood = -_negbinom_log_likelihood([r_mle, p_mle], ad)
log_likelihood_saturated = np.sum(np.log([stats.nbinom.pmf(k, r_mle, (k+.0000001)/n) for n,k in zip(dp, ad)]))
G2_stat = 2 * (log_likelihood_saturated - log_likelihood)
G2_p_value = stats.chi2.sf(G2_stat, df=dp.size-1)
d['G2_stat'] = G2_stat
d['G2_p_value'] = G2_p_value
elif BIC:
log_likelihood = -_negbinom_log_likelihood([r_mle, p_mle], ad)
d['BIC'] = 2 * np.log(dp.size) - 2 * log_likelihood
d['L'] = log_likelihood
return ad_th, d
##
def fit_betabinom(ad, dp, logratio=False, BIC=True):
def _betabinom_log_likelihood(params, ad, dp):
"""
Negative log-likelihood function for the Beta-Binomial distribution.
params: list of [alpha, beta], where alpha and beta are the parameters of the Beta distribution.
ad: observed allele depths (number of successes).
dp: observed depths (total trials).
"""
alpha, beta = params
if alpha <= 0 or beta <= 0:
return np.inf
log_likelihood = np.sum(stats.betabinom.logpmf(ad, dp, alpha, beta))
return -log_likelihood
# Initial guess for alpha and beta
mean_ad = ad.mean()
var_ad = ad.var()
p_initial = mean_ad / dp.mean()
alpha_initial = p_initial * ((p_initial * (1 - p_initial)) / (var_ad / dp.mean() - 1) - 1)
beta_initial = (1 - p_initial) * ((p_initial * (1 - p_initial)) / (var_ad / dp.mean() - 1) - 1)
alpha_initial = max(alpha_initial, 1e-5)
beta_initial = max(beta_initial, 1e-5)
bounds = [(1e-5, None), (1e-5, None)]
result = minimize(_betabinom_log_likelihood, x0=[alpha_initial, beta_initial], args=(ad, dp), bounds=bounds)
alpha_mle, beta_mle = result.x
np.random.seed(1234)
ad_th = stats.betabinom.rvs(dp.astype(int), alpha_mle, beta_mle, size=dp.size)
d = {}
if logratio:
log_likelihood = -_betabinom_log_likelihood([alpha_mle, beta_mle], ad, dp)
log_likelihood_saturated = np.sum(np.log([stats.betabinom.pmf(k, n, alpha_mle, beta_mle) for n, k in zip(dp, ad)]))
G2_stat = 2 * (log_likelihood_saturated - log_likelihood)
G2_p_value = stats.chi2.sf(G2_stat, df=dp.size - 1)
d['G2_stat'] = G2_stat
d['G2_p_value'] = G2_p_value
elif BIC:
log_likelihood = -_betabinom_log_likelihood([alpha_mle, beta_mle], ad, dp)
d['L'] = log_likelihood
d['BIC'] = 2 * np.log(dp.size) - 2 * log_likelihood
return ad_th, d
##
def fit_mixbinom(ad, dp, logratio=False, BIC=True):
np.random.seed(1234)
model = MixtureBinomial(n_components=2, tor=1e-20)
params = model.fit((ad, dp), max_iters=500, early_stop=True)
ad_th = model.sample(dp)
d = {}
if logratio:
raise ValueError('Logratio test not implemented yet...')
if BIC:
d['L'] = model.log_likelihood_mixture_bin(ad, dp, params)
d['BIC'] = model.model_scores['BIC']
return ad_th, d
##
def get_posteriors(ad, dp):
np.random.seed(1234)
model = MixtureBinomial(n_components=2, tor=1e-20)
model.fit((ad, dp), max_iters=500, early_stop=True)
# Access the estimated parameters
ps = model.params[:2]
pis = model.params[2:]
idx1 = np.argmax(ps)
idx0 = 0 if idx1 == 1 else 1
p1 = ps[idx1]
p0 = ps[idx0]
pi1 = pis[idx1]
pi0 = pis[idx0]
d = {'p':[p0,p1], 'pi':[pi0,pi1]}
# Get posterior probabilities
log_likelihoods = np.zeros((ad.size, 2))
for k in range(2):
log_likelihoods[:,k] = model.log_likelihood_binomial(ad, dp, d['p'][k], d['pi'][k])
log_weighted_likelihoods = log_likelihoods + np.log(d['pi'])
log_likelihood_sums = logsumexp(log_weighted_likelihoods, axis=1, keepdims=True)
log_posterior_probs = log_weighted_likelihoods - log_likelihood_sums
posterior_probs = np.exp(log_posterior_probs)
return posterior_probs
##
[docs]
def genotype_mix(
ad: np.array,
dp: np.array,
t_prob: float = .7,
t_vanilla: float = 0,
debug: bool = False,
min_AD: int = 1
) -> np.array:
"""
Derive a discrete genotype (1:'MUT', 0:'WT') for each cell, given the
AD and DP counts of one of its candidate mitochondrial variants.
"""
positive_idx = np.where(dp>0)[0]
posterior_probs = get_posteriors(ad[positive_idx], dp[positive_idx])
tests = [
(posterior_probs[:,1]>t_prob) & (posterior_probs[:,0]<(1-t_prob)), # & (ad[positive_idx]>=min_AD), # REMOVE!!
(posterior_probs[:,1]<(1-t_prob)) & (posterior_probs[:,0]>t_prob)
]
geno_prob = np.select(tests, [1,0], default=0)
genotypes = np.zeros(ad.size, dtype=np.int16)
genotypes[positive_idx] = geno_prob
# Compare to vanilla genotyping (AF>t --> 1, 0 otherwise)
if debug:
test = (ad/dp>t_vanilla) & (ad>=min_AD)
geno_vanilla = np.where(test,1,0)
df = pd.DataFrame({
'ad':ad[positive_idx], 'dp':dp[positive_idx],
'geno_vanilla':geno_vanilla[positive_idx],
'geno_prob':geno_prob,
'p0':posterior_probs[:,0], 'p1':posterior_probs[:,1]
})
print(pd.crosstab(df['geno_vanilla'], df['geno_prob'], dropna=False))
return df
else:
return genotypes
##
def get_posteriors_and_params(ad, dp):
np.random.seed(1234)
model = MixtureBinomial(n_components=2, tor=1e-20)
model.fit((ad, dp), max_iters=500, early_stop=True)
ps = model.params[:2]
pis = model.params[2:]
idx1 = np.argmax(ps)
idx0 = 0 if idx1 == 1 else 1
p1 = ps[idx1]
p0 = ps[idx0]
pi1 = pis[idx1]
pi0 = pis[idx0]
return [p0,p1], [pi0,pi1]
##
def simulate_component_data(p, n_trials, n_samples):
return np.random.binomial(n=n_trials, p=p, size=n_samples)
##
def get_components(ad, dp):
ps, pis = get_posteriors_and_params(ad, dp)
p0, p1 = ps
pi0, pi1 = pis
n_samples = 10000
n_trials = int(np.mean(dp))
n_samples_0 = int(n_samples * pi0)
n_samples_1 = n_samples - n_samples_0
x_component_0 = simulate_component_data(p0, n_trials, n_samples_0)
x_component_1 = simulate_component_data(p1, n_trials, n_samples_1)
return x_component_0, x_component_1
##