"""
This file is part of the MasterMSM package.
"""
import warnings
import math
from functools import reduce, cmp_to_key
import itertools
import numpy as np
import networkx as nx
import scipy.linalg as spla
import matplotlib.pyplot as plt
import multiprocessing as mp
from ..msm import msm_lib
[docs]class SuperMSM(object):
""" A class for constructing the MSM
Attributes
----------
data : list
A list of instances of the TimeSeries class
keys : list of str
Names of states.
msms : dict
A dictionary containing MSMs for different lag times.
sym : bool
Whether we want to enforce symmetrization.
"""
def __init__(self, trajs, file_keys=None, keys=None, sym=False):
"""
Parameters
----------
trajs : list
A list of TimeSeries objects from the trajectory module
file_keys : str
A file from which the states are read. If not defined
keys are automatically generated from the time series.
sym : bool
Tells MSM whether to symmetrize count matrices or not.
"""
self.data = trajs
if isinstance(keys, list):
print ("Using the following keys", keys)
self.keys = keys
else:
try:
self.keys = list(map(lambda x: x.split()[0],
open(file_keys, "r").readlines()))
except TypeError:
self.keys = self._merge_trajs()
self.dt = self._max_dt()
self._out()
self.sym = sym
self.msms = {}
def _merge_trajs(self):
""" Merge all trajectories into a consistent set.
Returns
-------
new_keys : list
Combination of keys from all trajectories.
"""
new_keys = []
for traj in self.data:
new_keys += filter(lambda x: x not in new_keys, traj.keys)
return new_keys
def _max_dt(self):
""" Find maximum dt in trajectories.
Returns
-------
float
The maximum value of the time-step within the trajectories.
"""
return np.max([x.dt for x in self.data])
def _out(self):
""" Output description to user """
try:
print ("\n Building MSM from \n", [x.file_name for x in self.data])
except AttributeError:
pass
print (" # states: %g"%(len(self.keys)))
[docs] def do_msm(self, lagt, sliding=True):
""" Construct MSM for specific value of lag time.
Parameters
-----------
lagt : float
The lag time.
sliding : bool
Whether a sliding window is used in counts.
"""
self.msms[lagt] = MSM(self.data, keys=self.keys, lagt=lagt, sym=self.sym)
self.msms[lagt].do_count(sliding=sliding)
[docs] def convergence_test(self, save=None, N=1, sliding=True, \
error=True, time=None, out=None):
""" Carry out convergence test for relaxation times.
Parameters
-----------
N : int
The number of modes for which we are building the MSM.
sliding : bool
Whether a sliding window should be used or not.
error : bool
Whether to include errors or not.
time : array
The range of times that must be used.
save : bool, str
Whether we want to save to file.
"""
# print "\n Convergence test for the MSM: looking at implied timescales"
# defining lag times to produce the MSM
try:
assert(time is None)
lagtimes = self.dt*np.array([1] + range(10,100,10))
except AssertionError:
lagtimes = np.array(time)
# create MSMs at multiple lag times
for lagt in lagtimes:
if lagt not in self.msms.keys():
self.msms[lagt] = MSM(self.data, self.keys, lagt, sym=self.sym)
self.msms[lagt].do_count(sliding=sliding)
self.msms[lagt].do_trans()
if error:
self.msms[lagt].boots(nboots=50)
else:
if not hasattr(self.msms[lagt], 'tau_ave') and error:
self.msms[lagt].boots(nboots=50)
[docs] def ck_test(self, init=None, sliding=True, error=True, out=None, time=None):
""" Carry out Chapman-Kolmogorov test.
We follow the procedure described by Prinz et al.[1]_
Parameters
-----------
init : str
The states from which the relaxation should be simulated.
sliding : bool
Whether a sliding window is used to count transitions.
error : bool
Whether errors are calculated.
References
-----
.. [1] J.-H. Prinz et al, "Markov models of molecular kinetics: Generation
and validation", J. Chem. Phys. (2011).
"""
nkeys = len(self.keys)
# defining lag times to produce the MSM
try:
assert(time is None)
lagtimes = self.dt*np.array([1] + range(50,210,25))
except:
lagtimes = np.array(time)
# defining lag times to propopagate
#logs = np.linspace(np.log10(self.dt),np.log10(np.max(lagtimes)*5),20)
#lagtimes_exp = 10**logs
init_states = [x for x in range(nkeys) if self.keys[x] in init]
# create MSMs at multiple lag times
pMSM = []
for lagt in lagtimes:
if lagt not in self.msms.keys():
self.msms[lagt] = MSM(self.data, self.keys, lagt)
self.msms[lagt].do_count(sliding=sliding)
self.msms[lagt].do_trans()
# propagate transition matrix
lagtimes_exp = np.logspace(np.log10(lagt), np.log10(np.max(lagtimes)*10), num=50)
ltimes_exp_int = [int(lagt*math.floor(x/float(lagt))) \
for x in lagtimes_exp if x > lagt]
time, pop = self.msms[lagt].propagateT(init=init, time=ltimes_exp_int)
ltime = len(time)
# keep only selected states
pop_relax = np.zeros(ltime)
for x in init:
ind = self.msms[lagt].keep_keys.index(x)
#pop_relax += pop[:,ind]
pop_relax += np.array([x[ind] for x in pop])
pMSM.append((time, pop_relax))
# calculate population from simulation data
logs = np.linspace(np.log10(self.dt),np.log10(np.max(lagtimes)*10),10)
lagtimes_md = 10**logs
pMD = []
epMD = []
for lagt in lagtimes_md:
try:
num = np.sum([self.msms[lagt].count[j,i] for (j,i) in \
itertools.product(init_states,init_states)])
except KeyError:
self.msms[lagt] = MSM(self.data, self.keys, lagt)
self.msms[lagt].do_count(sliding=sliding)
num = np.sum([self.msms[lagt].count[j,i] for (j,i) in \
itertools.product(init_states,init_states)])
den = np.sum([self.msms[lagt].count[:,i] for i in init_states])
try:
pMD.append((lagt, float(num)/den))
except ZeroDivisionError:
print (" ZeroDivisionError: pMD.append((lagt, float(num)/den)) ")
print (" lagt",lagt)
print (" num", num)
print (" den", den)
sys.exit()
num = pMD[-1][1]-pMD[-1][1]**2
den = np.sum([self.msms[lagt].count[j,i] for (i,j) in \
itertools.product(init_states,init_states)])
epMD.append(np.sqrt(lagt/self.dt*num/den))
pMD = np.array(pMD)
epMD = np.array(epMD)
return pMSM, pMD, epMD
[docs] def do_lbrate(self, evecs=False, error=False):
""" Calculates the rate matrix using the lifetime based method.
We use the method described by Buchete and Hummer [1]_.
Parameters
----------
evecs : bool
Whether we want the left eigenvectors of the rate matrix.
error : bool
Whether to include errors or not.
References
-----
.. [1] N.-V. Buchete and G. Hummer, "Coarse master equations for
peptide folding dynamics", J. Phys. Chem. B (2008).
"""
nkeys = len(self.keys)
# define multiprocessing options
nproc = mp.cpu_count()
if len(self.data) < nproc:
nproc = len(self.data)
pool = mp.Pool(processes=nproc)
# generate multiprocessing input
sliding = True
mpinput = [[x.distraj, x.dt, self.keys, x.dt, sliding]
for x in self.data]
# run counting using multiprocessing
result = pool.map(msm_lib.calc_count_worker, mpinput)
pool.close()
pool.join()
# add up all independent counts
count = reduce(lambda x, y: np.matrix(x) + np.matrix(y), result)
# calculate length of visits
mpinput = [[x.distraj, x.dt, self.keys] for x in self.data]
# run counting using multiprocessing
pool = mp.Pool(processes=nproc)
result = pool.map(msm_lib.calc_lifetime, mpinput)
pool.close()
pool.join()
# reduce
life = np.zeros((nkeys), float)
for k in range(nkeys):
kk = self.keys[k]
visits = list(itertools.chain([x[kk] for x in result if kk in x.keys()]))
if len(visits) > 0:
life[k] = np.mean([item for lst in visits for item in lst])
self.life = life
# calculate lifetime based rates
lbrate = np.zeros((nkeys,nkeys), float)
for i in range(nkeys):
kk = self.keys[k]
ni = np.sum([count[x,i] for x in range(nkeys) if x != i])
if ni > 0:
for j in range(nkeys):
lbrate[j,i] = count[j,i]/(ni*life[i])
lbrate[i,i] = 0.
lbrate[i,i] = -np.sum(lbrate[:,i])
self.lbrate = lbrate
self.tauK, self.peqK, self.lvecsK, self.rvecsK = \
msm_lib.calc_eigsK(self.lbrate, evecs=True)
if error:
self.lbrate_error = self.lbrate/np.sqrt(count)
[docs]class MSM(object):
""" A class for constructing an MSM at a specific lag time.
Attributes
----------
keys : list of str
Names of states.
count : np array
Transition count matrix.
keep_states : list of str
Names of states after removing not strongly connected sets.
sym : bool
Whether we want to enforce symmetrization.
"""
def __init__(self, data=None, keys=None, lagt=None, sym=False):
"""
Parameters
----------
data : list
Set of trajectories used for the construction.
keys : list of str
Set of states in the model.
lag : float
Lag time for building the MSM.
sym : bool
Whether we want to enforce symmetrization.
"""
self.keys = keys
self.data = data
self.lagt = lagt
self.sym = sym
[docs] def do_count(self, sliding=True):
""" Calculates the count matrix.
Parameters
----------
sliding : bool
Whether a sliding window is used in counts.
"""
self.count = self.calc_count_multi(sliding=sliding)
if self.sym:
print (" symmetrizing")
self.count += self.count.transpose()
self.keep_states, self.keep_keys = self.check_connect()
[docs] def do_trans(self, evecs=False):
""" Wrapper for transition matrix calculation.
Also calculates its eigenvalues and right eigenvectors.
Parameters
----------
evecs : bool
Whether we want the left eigenvectors of the transition matrix.
"""
#print "\n Calculating transition matrix ..."
nkeep = len(self.keep_states)
keep_states = self.keep_states
count = self.count
self.trans = msm_lib.calc_trans(nkeep, keep_states, count)
if not evecs:
self.tauT, self.peqT = self.calc_eigsT()
else:
self.tauT, self.peqT, self.rvecsT, self.lvecsT = \
self.calc_eigsT(evecs=True)
[docs] def do_rate(self, method='Taylor', evecs=False, init=False, report=False):
""" Calculates the rate matrix from the transition matrix.
We use a method based on a Taylor expansion [1]_ or the maximum likelihood
propagator based (MLPB) method [2]_.
Parameters
----------
evecs : bool
Whether we want the left eigenvectors of the rate matrix.
method : str
Which method one wants to use. Acceptable values are 'Taylor'
and 'MLPB'.
init : array
Rate matrix to start optimization from.
report : bool
Whether to report the results from MC in MLPB.
References
-----
.. [1] D. De Sancho, J. Mittal and R. B. Best, "Folding kinetics
and unfolded state dynamics of the GB1 hairpin from molecular
simulation", J. Chem. Theory Comput. (2013).
.. [2] N.-V. Buchete and G. Hummer, "Coarse master equations for
peptide folding dynamics", J. Phys. Chem. B (2008).
"""
#print "\n Calculating rate matrix ..."
nkeep = len(self.keep_states)
if method == 'Taylor':
self.rate = msm_lib.calc_rate(nkeep, self.trans, self.lagt)
elif method == 'MLPB':
if isinstance(init, np.ndarray):
rate_init = init
elif init == 'random':
rate_init = np.random.rand(nkeep, nkeep)*1.e-2
else:
rate_init = msm_lib.calc_rate(nkeep, self.trans, self.lagt)
self.rate, ml, beta = msm_lib.calc_mlrate(nkeep, self.count, self.lagt, rate_init)
if report:
_, ax = plt.subplots(2,1, sharex=True)
ax[0].plot(ml)
ax[0].plot(np.ones(len(ml))*ml[0], '--')
ax[0].set_ylabel('-ln($L$)')
ax[1].plot(beta)
#ax[1].set_ylim(0,1.05)
ax[1].set_xlim(0,len(ml))
ax[1].set_xlabel('MC steps x n$_{freq}$')
ax[1].set_ylabel(r'1/$\beta$')
#print self.rate
if not evecs:
self.tauK, self.peqK = self.calc_eigsK()
else:
self.tauK, self.peqK, self.rvecsK, self.lvecsK = \
self.calc_eigsK(evecs=True)
[docs] def calc_count_multi(self, sliding=True, nproc=None):
""" Calculate transition count matrix in parallel
Parameters
----------
sliding : bool
Whether a sliding window is used in counts.
nproc : int
Number of processors to be used.
Returns
-------
array
The count matrix.
"""
# define multiprocessing options
if not nproc:
nproc = mp.cpu_count()
if len(self.data) < nproc:
nproc = len(self.data)
#print "\n ...running on %g processors"%nproc
elif nproc > mp.cpu_count():
nproc = mp.cpu_count()
pool = mp.Pool(processes=nproc)
# generate multiprocessing input
mpinput = [[x.distraj, x.dt, self.keys, self.lagt, sliding] \
for x in self.data]
# run counting using multiprocessing
result = pool.map(msm_lib.calc_count_worker, mpinput)
pool.close()
pool.join()
# add up all independent counts
count = reduce(lambda x, y: np.matrix(x) + np.matrix(y), result)
return np.array(count)
# def calc_count_seq(self, sliding=True):
# """ Calculate transition count matrix sequentially
#
# """
# keys = self.keys
# nkeys = len(keys)
# count = np.zeros([nkeys,nkeys], int)
#
# for traj in self.data:
# raw = traj.states
# dt = traj.dt
# if sliding:
# lag = 1 # every state is initial state
# else:
# lag = int(self.lagt/dt) # number of frames for lag time
# nraw = len(raw)
# for i in range(0, nraw-lag, lag):
# j = i + lag
# state_i = raw[i]
# state_j = raw[j]
# if state_i in keys:
# idx_i = keys.index(state_i)
# if state_j in keys:
# idx_j = keys.index(state_j)
# try:
# count[idx_j][idx_i] += 1
# except IndexError:
# pass
# return count
#
[docs] def check_connect(self):
""" Check connectivity of rate matrix.
Returns
-------
keep_states : list
Indexes of states from the largest strongly connected set.
keep_keys : list
Keys for the largest strongly connected set.
References
-----
We use the Tarjan algorithm as implemented in NetworkX. [1]_
.. [1] R. Tarjan, "Depth-first search and linear graph algorithms",
SIAM Journal of Computing (1972).
"""
D = nx.DiGraph(self.count)
keep_states = list(sorted(list(nx.strongly_connected_components(D)),
key = len, reverse=True)[0])
keep_states.sort()
# keep_states = sorted(nx.strongly_connected_components(D)[0])
keep_keys = list(map(lambda x: self.keys[x], keep_states))
return keep_states, keep_keys
[docs] def calc_eigsK(self, evecs=False):
""" Calculate eigenvalues and eigenvectors of rate matrix K
Parameters
-----------
evecs : bool
Whether we want the eigenvectors of the rate matrix.
Returns
-------
tauK : numpy array
Relaxation times from K.
peqK : numpy array
Equilibrium probabilities from K.
rvecsK : numpy array, optional
Right eigenvectors of K, sorted.
lvecsK : numpy array, optional
Left eigenvectors of K, sorted.
"""
evalsK, lvecsK, rvecsK = \
spla.eig(self.rate, left=True)
# sort modes
nkeep = len(self.keep_states)
elistK = []
for i in range(nkeep):
elistK.append([i,np.real(evalsK[i])])
elistK.sort(key=cmp_to_key(msm_lib.esort))
# calculate relaxation times from K and T
tauK = []
for i in range(1, nkeep):
iiK, lamK = elistK[i]
tauK.append(-1./lamK)
# equilibrium probabilities
ieqK, _ = elistK[0]
peqK_sum = reduce(lambda x, y: x + y, map(lambda x: rvecsK[x,ieqK],
range(nkeep)))
peqK = rvecsK[:,ieqK]/peqK_sum
if not evecs:
return tauK, peqK
else:
# sort eigenvectors
rvecsK_sorted = np.zeros((nkeep, nkeep), float)
lvecsK_sorted = np.zeros((nkeep, nkeep), float)
for i in range(nkeep):
iiK, lamK = elistK[i]
rvecsK_sorted[:,i] = rvecsK[:,iiK]
lvecsK_sorted[:,i] = lvecsK[:,iiK]
return tauK, peqK, rvecsK_sorted, lvecsK_sorted
[docs] def calc_eigsT(self, evecs=False):
"""
Calculate eigenvalues and eigenvectors of transition matrix T.
Parameters
-----------
evecs : bool
Whether we want the eigenvectors of the transition matrix.
Returns
-------
tauT : numpy array
Relaxation times from T.
peqT : numpy array
Equilibrium probabilities from T.
rvecsT : numpy array, optional
Right eigenvectors of T, sorted.
lvecsT : numpy array, optional
Left eigenvectors of T, sorted.
"""
#print "\n Calculating eigenvalues and eigenvectors of T"
evalsT, lvecsT, rvecsT = \
spla.eig(self.trans, left=True)
# sort modes
nkeep = len(self.keep_states)
elistT = []
for i in range(nkeep):
elistT.append([i,np.real(evalsT[i])])
#elistT.sort(key=msm_lib.esort)
elistT.sort(key=cmp_to_key(msm_lib.esort))
# calculate relaxation times
tauT = []
warnings.filterwarnings("ignore", category=RuntimeWarning)
for i in range(1, nkeep):
iiT, lamT = elistT[i]
tauT.append(-self.lagt/np.log(lamT))
# equilibrium probabilities
ieqT, _ = elistT[0]
peqT_sum = reduce(lambda x,y: x + y, map(lambda x: rvecsT[x,ieqT],
range(nkeep)))
peqT = rvecsT[:,ieqT]/peqT_sum
if not evecs:
return tauT, peqT
else:
# sort eigenvectors
rvecsT_sorted = np.zeros((nkeep, nkeep), float)
lvecsT_sorted = np.zeros((nkeep, nkeep), float)
for i in range(nkeep):
iiT, lamT = elistT[i]
rvecsT_sorted[:,i] = rvecsT[:,iiT]
lvecsT_sorted[:,i] = lvecsT[:,iiT]
return tauT, peqT, rvecsT_sorted, lvecsT_sorted
[docs] def boots(self, nboots=100, nproc=None, sliding=True):
""" Bootstrap the simulation data to calculate errors.
We generate count matrices with the same number of counts
as the original by bootstrapping the simulation data. The
time series of discrete states are chopped into chunks to
form a pool. From the pool we randomly select samples until
the number of counts reaches the original.
Parameters
----------
nboots : int
Number of bootstrap samples
nproc : int
Number of processors to use
"""
# generate trajectory list for easy handling
filetmp = msm_lib.traj_split(data=self.data, lagt=self.lagt)
# multiprocessing options
if not nproc:
nproc = mp.cpu_count()
#print " ...running on %g processors"%nproc
pool = mp.Pool(processes=nproc)
ncount = np.sum(self.count)
multi_boots_input = list(map(lambda x: [filetmp, self.keys, self.lagt, ncount,
sliding], range(nboots)))
# TODO: find more elegant way to pass arguments
result = pool.map(msm_lib.do_boots_worker, multi_boots_input)
pool.close()
pool.join()
tauT_boots = [x[0] for x in result]
peqT_boots = [x[1] for x in result]
keep_keys_boots = [x[3] for x in result]
self.peq_ave, self.peq_std = msm_lib.peq_averages(peqT_boots, keep_keys_boots, self.keys)
self.tau_ave, self.tau_std = msm_lib.tau_averages(tauT_boots, self.keys)
#self.trans_ave, self.trans_std = msm_lib.matrix_ave(trans_boots, keep_keys_boots, self.keys)
## def pcca(self, lagt=None, N=2, optim=False):
## """ Wrapper for carrying out PCCA clustering
##
## Parameters
## ----------
## lagt : float
## The lag time.
##
## N : int
## The number of clusters.
##
## optim : bool
## Whether optimization is desired.
##
## """
##
## return pcca.PCCA(parent=self, lagt=lagt, optim=optim)
#
# def rate_scale(self, iscale=None, scaling=1):
# """ Scaling columns of the rate matrix by a certain value
#
# Parameters
# ----------
# states : list
# The list of state keys we want to scale.
# val : float
# The scaling factor.
#
# """
# #print "\n scaling kiS by %g"%scaling
# nkeep = len(self.keep_keys)
# jscale = [self.keep_keys.index(x) for x in iscale]
# for j in jscale:
# for l in range(nkeep):
# self.rate[l,j] = self.rate[l,j]*scaling
## tauK, peqK, rvecsK_sorted, lvecsK_sorted = self.rate.calc_eigs(lagt=lagt, evecs=False)
#
#
[docs] def do_pfold(self, FF=None, UU=None, dot=False):
""" Wrapper to calculate reactive fluxes and committors
We use the Berzhkovskii-Hummer-Szabo method, J. Chem. Phys. (2009)
Parameters
----------
FF : list
Folded states.
UU : list
Unfolded states.
dot : string
Filename to output dot graph.
"""
#print "\n Calculating commitment probabilities and fluxes..."
_states = range(len(self.keep_states))
if isinstance(FF, list):
_FF = [self.keep_keys.index(x) for x in FF]
else:
_FF = [self.keep_keys.index(FF)]
if isinstance(UU, list):
_UU = [self.keep_keys.index(x) for x in UU]
else:
_UU = [self.keep_keys.index(UU)]
# do the calculation
self.J, self.pfold, self.sum_flux, self.kf = msm_lib.run_commit(_states, \
self.rate, self.peqK, _FF, _UU)
## # write graph in dot format
## if dot:
## D = nx.DiGraph(J)
### visual_lib.write_dot(D, nodeweight=self.peqT, edgeweight=self.trans, out="out.dot")
## visual_lib.write_dot(D, nodeweight=self.peqT, out="out.dot")
### visual_lib.write_dot(D, out="out.dot")
#
# def all_paths(self, FF=None, UU=None, out="graph.dot", cut=1):
# """ Enumerate all paths in network.
#
# Parameters
# ----------
# FF : list
# Folded states, currently limited to just one.
# UU : list
# Unfolded states, currently limited to just one.
#
# """
# nkeys = len(self.keep_keys)
# pfold = self.pfold
# J = self.J
# flux = self.sum_flux
#
# print "\n Enumerating all paths :\n"
# print " Total flux", flux
# if isinstance(FF, list):
# _FF = [self.keep_keys.index(x) for x in FF]
# else:
# _FF = [self.keep_keys.index(FF)]
# if isinstance(UU, list):
# _UU = [self.keep_keys.index(x) for x in UU]
# else:
# _UU = [self.keep_keys.index(UU)]
#
# # generate graph from flux matrix
# Jnode, Jpath = msm_lib.gen_path_lengths(range(nkeys), J, pfold, \
# flux, _FF, _UU)
# JpathG = nx.DiGraph(Jpath.transpose())
#
# # enumerate paths
# tot_flux = 0
# paths = []
# Jcum = np.zeros((nkeys, nkeys), float)
# paths_cum = {}
# p = 0
# for (j,i) in itertools.product(_FF,_UU):
# try:
# for path in nx.all_simple_paths(JpathG, i, j):
# print " Path :",path
# f = J[path[1],path[0]]
# #print " %2i -> %2i: %10.4e "%(path[0], path[1], J[path[1],path[0]])
# for i in range(2, len(path)):
# #print " %2i -> %2i: %10.4e %10.4e"%\
# # (path[i-1], path[i], J[path[i],path[i-1]], Jnode[path[i-1]])
# f *= J[path[i],path[i-1]]/Jnode[path[i-1]]
# tot_flux +=f
# #print " J(path) = %10.4e, %10.4e"%(f,tot_flux)
# paths_cum[p] = {}
# paths_cum[p]['path'] = path
# paths_cum[p]['flux'] = f
# p +=1
#
# except nx.NetworkXNoPath:
# #print " No path for %g -> %g\n Stopping here"%(i, j)
# pass
# # store flux in matrix
#
# print " Commulative flux: %10.4e"%tot_flux
#
# # sort paths based on flux
# sorted_paths = sorted(paths_cum.items(), key=operator.itemgetter(1))
# sorted_paths.reverse()
#
# # print paths up to a flux threshold
# cum = 0
# for k,v in sorted_paths:
# path = v['path']
# f = v['flux']
# cum += f
# for j in range(1, len(path)):
# i = j - 1
# Jcum[path[j],path[i]] = J[path[j],path[i]]
# if cum/tot_flux > cut:
# break
# print Jcum
# visual_lib.write_dot(Jcum, nodeweight=self.peqK, \
# rank=pfold, out=out)
#
# def do_dijkstra(self, FF=None, UU=None, cut=None, npath=None, out="graph.dot"):
# """ Obtaining the maximum flux path wrapping NetworkX's Dikjstra algorithm.
#
# Parameters
# ----------
# FF : list
# Folded states, currently limited to just one.
# UU : list
# Unfolded states, currently limited to just one.
# cut : float
# Percentage of flux to account for.
#
# """
# nkeys = len(self.keep_keys)
# pfold = self.pfold
# J = copy.deepcopy(self.J)
# print J
# flux = self.sum_flux
# print "\n Finding shortest path :\n"
# print " Total flux", flux
# if isinstance(FF, list):
# _FF = [self.keep_keys.index(x) for x in FF]
# else:
# _FF = [self.keep_keys.index(FF)]
# if isinstance(UU, list):
# _UU = [self.keep_keys.index(x) for x in UU]
# else:
# _UU = [self.keep_keys.index(UU)]
#
# # generate graph from flux matrix
# Jnode, Jpath = msm_lib.gen_path_lengths(range(nkeys), J, pfold, \
# flux, _FF, _UU)
# Jnode_init = Jnode
# JpathG = nx.DiGraph(Jpath.transpose())
#
# # find shortest paths
# Jcum = np.zeros((nkeys, nkeys), float)
# cum_paths = []
# n = 0
# tot_flux = 0
# while True:
# n +=1
# Jnode, Jpath = msm_lib.gen_path_lengths(range(nkeys), J, pfold, \
# flux, _FF, _UU)
# # generate nx graph from matrix
# JpathG = nx.DiGraph(Jpath.transpose())
# # find shortest path for sets of end states
# paths = []
# for (j,i) in itertools.product(_FF,_UU):
# try:
# path = nx.dijkstra_path(JpathG, i, j)
# pathlength = nx.dijkstra_path_length(JpathG, i, j)
# #print " shortest path:", path, pathlength
# paths.append(((j,i), path, pathlength))
# except nx.NetworkXNoPath:
# #print " No path for %g -> %g\n Stopping here"%(i, j)
# pass
#
# # sort maximum flux paths
# try:
# shortest_path = sorted(paths, key=itemgetter(2))[0]
# except IndexError:
# print " No paths"
# break
#
# # calculate contribution to flux
# sp = shortest_path[1]
# f = self.J[sp[1],sp[0]]
# path_fluxes = [f]
# print ' Path :', sp
# print "%2i -> %2i: %10.4e "%(sp[0], sp[1], self.J[sp[1],sp[0]])
# for j in range(2, len(sp)):
# i = j - 1
# print "%2i -> %2i: %10.4e %10.4e"%(sp[i], sp[j], \
# self.J[sp[j],sp[i]], Jnode_init[sp[i]])
# f *= self.J[sp[j],sp[i]]/Jnode_init[sp[i]]
# path_fluxes.append(J[sp[j],sp[i]])
#
# # find bottleneck
# ib = np.argmin(path_fluxes)
#
# # remove flux from edges
# for j in range(1,len(sp)):
# i = j - 1
# J[sp[j],sp[i]] -= f
# if J[sp[j],sp[i]] < 0:
# J[sp[j],sp[i]] = 0.
# J[sp[ib+1],sp[ib]] = 0. # bottleneck leftover flux = 0.
#
# # store flux in matrix
# for j in range(1, len(sp)):
# i = j - 1
# Jcum[sp[j],sp[i]] += f
#
# flux -= f
# tot_flux +=f
# cum_paths.append((shortest_path, f))
# print ' flux: %4.2e; ratio: %4.2e; leftover: %4.2e'%(f, f/self.sum_flux,flux/self.sum_flux)
# #print ' leftover flux: %10.4e\n'%flux
# if cut is not None:
# if flux/self.sum_flux < cut:
# break
#
# if npath is not None:
# if n == npath:
# break
#
# print "\n Commulative flux: %10.4e"%tot_flux
# print " Fraction: %10.4e"%(tot_flux/self.sum_flux)
## visual_lib.write_dot(Jcum, nodeweight=self.peqK, \
## rank=pfold, out=out)
#
# return cum_paths
#
#
[docs] def sensitivity(self, FF=None, UU=None, dot=False):
"""
Sensitivity analysis of the states in the network.
We use the procedure described by De Sancho, Kubas,
Blumberger and Best [1]_.
Parameters
----------
FF : list
Folded states.
UU : list
Unfolded states.
Returns
-------
dJ : list
Derivative of flux.
d_peq : list
Derivative of equilibrium populations.
d_kf : list
Derivative of global rate.
kf : float
Global rate.
References
-----
.. [1] D. De Sancho, A. Kubas, P.-H. Wang, J. Blumberger, R. B.
Best "Identification of mutational hot spots for substrate diffusion:
Application to myoglobin", J. Chem. Theory Comput. (2015).
"""
nkeep = len(self.keep_states)
K = self.rate
peqK = self.peqK
# calculate pfold
self.do_pfold(FF=FF, UU=UU)
pfold = self.pfold
sum_flux = self.sum_flux
kf = self.kf
if isinstance(FF, list):
_FF = [self.keep_keys.index(x) for x in FF]
else:
_FF = [self.keep_keys.index(FF)]
if isinstance(UU, list):
_UU = [self.keep_keys.index(x) for x in UU]
else:
_UU = [self.keep_keys.index(UU)]
pu = np.sum([peqK[x] for x in range(nkeep) if pfold[x] < 0.5])
dJ = []
d_pu = []
d_lnkf = []
for s in range(nkeep):
d_K = msm_lib.partial_rate(K, s)
d_peq = msm_lib.partial_peq(peqK, s)
d_pfold = msm_lib.partial_pfold(range(nkeep), K, d_K, _FF, _UU, s)
dJ.append(msm_lib.partial_flux(range(nkeep), peqK, K, pfold, \
d_peq, d_K, d_pfold, _FF))
d_pu.append(np.sum([d_peq[x] for x in range(nkeep) \
if pfold[x] < 0.5]))
d_lnkf.append((dJ[-1]*pu - sum_flux*d_pu[-1])/pu**2)
self.kf = kf
self.d_pu = d_pu
self.dJ = dJ
self.d_lnkf = d_lnkf/kf
[docs] def propagateK(self, p0=None, init=None, time=None):
""" Propagation of rate matrix using matrix exponential
Parameters
----------
p0 : string
Filename with initial population.
init : string
State(s) where the population is initialized.
time : list, array
User defined range of temperatures for propagating the dynamics.
Returns
-------
popul : array
Population of all states as a function of time.
pnorm : array
Population of all states as a function of time - normalized.
"""
# defining times for relaxation
try:
assert(time is None)
tmin = self.lagt
tmax = 1e4*self.lagt
logt = np.arange(np.log10(tmin), np.log10(tmax), 0.2)
time = 10**logt
except:
time = np.array(time)
nkeep = len(self.keep_states)
if p0 is not None:
try:
pini = np.array(p0)
except TypeError:
try:
#print " reading initial population from file: %s"%p0
pini = [float(y) for y in \
filter(lambda x: x.split()[0] not in ["#","@"],
open(p0, "r").readlines())]
except TypeError:
print (" p0 is not file")
print (" exiting here")
return
elif init is not None:
#print " initializing all population in states"
#print init
pini = [self.peqT[x] if self.keep_keys[x] in init else 0. for x in range(nkeep)]
# check normalization and size
if len(pini) != nkeep:
print (" initial population vector and state space have different sizes")
print (" stopping here")
return
else:
sum_pini = np.sum(pini)
pini_norm = [np.float(x)/sum_pini for x in pini]
# propagate rate matrix : parallel version
popul = []
for t in time:
x = [self.rate, t, pini_norm]
popul.append(msm_lib.propagate_worker(x))
#nproc = mp.cpu_count()
#pool = mp.Pool(processes=nproc)
#pool_input = [(self.rate, t, pini_norm) for t in time]
#popul = pool.map(msm_lib.propagate_worker, tuple(pool_input))
#pool.close()
#pool.join()
## normalize relaxation
#imax = np.argmax(ptot)
#maxpop = ptot[imax]
#imin = np.argmin(ptot)
#minpop = ptot[imin]
#if imax < imin:
# pnorm = map(lambda x: (x-minpop)/(maxpop-minpop), ptot)
#else:
# pnorm = map(lambda x: 1 - (x-minpop)/(maxpop-minpop), ptot)
return time, popul #popul #, pnorm
[docs] def propagateT(self, p0=None, init=None, time=None):
""" Propagation of transition matrix
Parameters
----------
p0 : string
Filename with initial population.
init : string
State(s) where the population is initialized.
time : list, int
User defined range of temperatures for propagating the dynamics.
Returns
-------
popul : array
Population of all states as a function of time.
pnorm : array
Population of all states as a function of time - normalized.
Notes
-----
There is probably just one essential difference between propagateT
and propagateK. We are obtaining the time evolution of the population
towards the equilibrium distribution. Using K, we can obtain the
population at any given time (P(t) = exp(Kt)P0), while here we are
limited to powers of the transition matrix (hence,
P(nt) = [T(t)**n]*P0).
"""
# defining times for relaxation
try:
assert(time is None)
tmin = self.lagt
tmax = 1e4*self.lagt
logt = np.arange(np.log10(tmin), np.log10(tmax), 0.2)
time = 10**logt
except:
time = np.array(time)
nkeep = len(self.keep_states)
if p0 is not None:
try:
pini = np.array(p0)
except TypeError:
try:
#print " reading initial population from file: %s"%p0
pini = [float(y) for y in \
filter(lambda x: x.split()[0] not in ["#","@"],
open(p0, "r").readlines())]
except TypeError:
print (" p0 is not file")
print (" exiting here")
return
elif init is not None:
#print " initializing all population in states"
#print init
pini = [self.peqT[x] if self.keep_keys[x] in init else 0. for x in range(nkeep)]
# check normalization and size
if len(pini) != nkeep:
print (" initial population vector and state space have different sizes")
print (" stopping here")
return
else:
sum_pini = np.sum(pini)
pini_norm = [np.float(x)/sum_pini for x in pini]
# propagate transition matrix : parallel version
popul = []
tcum = []
for t in time:
power = int(t/self.lagt)
tcum.append(self.lagt*power)
x = [self.trans, power, pini_norm]
popul.append(msm_lib.propagateT_worker(x))
#nproc = mp.cpu_count()
#pool = mp.Pool(processes=nproc)
#pool_input = [(self.rate, t, pini_norm) for t in time]
#popul = pool.map(msm_lib.propagate_worker, tuple(pool_input))
#pool.close()
#pool.join()
## normalize relaxation
#imax = np.argmax(ptot)
#maxpop = ptot[imax]
#imin = np.argmin(ptot)
#minpop = ptot[imin]
#if imax < imin:
# pnorm = map(lambda x: (x-minpop)/(maxpop-minpop), ptot)
#else:
# pnorm = map(lambda x: 1 - (x-minpop)/(maxpop-minpop), ptot)
return tcum, popul #popul #, pnorm
[docs] def acf_mode(self, modes=None):
""" Calculate mode autocorrelation functions.
We use the procedure described by Buchete and Hummer [1]_.
Parameters
----------
mode : int, list
Index(es) for sorted autocorrelation functions.
time : list, int
User defined range of temperatures for ACF.
Returns
-------
corr : array
The mode(s) autocorrelation function.
References
----------
.. [1] N.-V. Buchete and G. Hummer, "Coarse master equations for
peptide folding dynamics", J. Phys. Chem. B (2008).
"""
if not modes:
modes = range(1,len(self.keep_keys))
kk = self.keep_keys
trajs = [x.distraj for x in self.data]
acf_ave = {}
for m in modes:
x2 = []
acf_cum = []
for tr in trajs:
projected = []
for s in tr:
try:
projected.append(self.lvecsT[kk.index(s),m])
except ValueError:
pass
x2.extend([x**2 for x in projected])
acf = np.correlate(projected, projected, mode='full')
acf_half = acf[int(acf.size / 2):]
acf_cum.append(acf_half)
lmin = np.min([len(x) for x in acf_cum])
acf_ave[m] = []
for l in range(lmin):
num = np.sum([x[l] for x in acf_cum])
denom = np.sum([len(x)-l for x in acf_cum])
acf_ave[m].append(num/denom)
acf_ave[m] /=np.mean(x2)
return acf_ave