Generating Markov transition matrix in Python

If you want to do it all in pandas, here is an approach that works for non numeric data:

import pandas as pd
transitions = ['A', 'B', 'B', 'C', 'B', 'A', 'D', 'D', 'A', 'B', 'A', 'D']

df = pd.DataFrame(transitions)

# create a new column with data shifted one space
df['shift'] = df[0].shift(-1)

# add a count column (for group by function)
df['count'] = 1

# groupby and then unstack, fill the zeros
trans_mat = df.groupby([0, 'shift']).count().unstack().fillna(0)

# normalise by occurences and save values to get transition matrix
trans_mat = trans_mat.div(trans_mat.sum(axis=1), axis=0).values

It's slower than the pure python approach but maybe worth it for flexibility and to avoid creating your own function.


This might give you some ideas:

transitions = ['A', 'B', 'B', 'C', 'B', 'A', 'D', 'D', 'A', 'B', 'A', 'D']

def rank(c):
    return ord(c) - ord('A')

T = [rank(c) for c in transitions]

#create matrix of zeros

M = [[0]*4 for _ in range(4)]

for (i,j) in zip(T,T[1:]):
    M[i][j] += 1

#now convert to probabilities:
for row in M:
    n = sum(row)
    if n > 0:
        row[:] = [f/sum(row) for f in row]

#print M:

for row in M:
    print(row)

output:

[0.0, 0.5, 0.0, 0.5]
[0.5, 0.25, 0.25, 0.0]
[0.0, 1.0, 0.0, 0.0]
[0.5, 0.0, 0.0, 0.5]

On Edit Here is a function which implements the above ideas:

#the following code takes a list such as
#[1,1,2,6,8,5,5,7,8,8,1,1,4,5,5,0,0,0,1,1,4,4,5,1,3,3,4,5,4,1,1]
#with states labeled as successive integers starting with 0
#and returns a transition matrix, M,
#where M[i][j] is the probability of transitioning from i to j

def transition_matrix(transitions):
    n = 1+ max(transitions) #number of states

    M = [[0]*n for _ in range(n)]

    for (i,j) in zip(transitions,transitions[1:]):
        M[i][j] += 1

    #now convert to probabilities:
    for row in M:
        s = sum(row)
        if s > 0:
            row[:] = [f/s for f in row]
    return M

#test:

t = [1,1,2,6,8,5,5,7,8,8,1,1,4,5,5,0,0,0,1,1,4,4,5,1,3,3,4,5,4,1,1]
m = transition_matrix(t)
for row in m: print(' '.join('{0:.2f}'.format(x) for x in row))

Output:

0.67 0.33 0.00 0.00 0.00 0.00 0.00 0.00 0.00
0.00 0.50 0.12 0.12 0.25 0.00 0.00 0.00 0.00
0.00 0.00 0.00 0.00 0.00 0.00 1.00 0.00 0.00
0.00 0.00 0.00 0.50 0.50 0.00 0.00 0.00 0.00
0.00 0.20 0.00 0.00 0.20 0.60 0.00 0.00 0.00
0.17 0.17 0.00 0.00 0.17 0.33 0.00 0.17 0.00
0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00
0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00
0.00 0.33 0.00 0.00 0.00 0.33 0.00 0.00 0.33