2013-11-18 9 views
0

Я хотел бы попросить о помощи в реализации скрытого подхода марков к назначению родословной на основе данных генотипа SNP. Учитывая, что у меня переход матрица генерируется как таковую:Внедрение алгоритма Витерби в HMM с изменяющимися матрицами излучения по маркетам геномики

states <- c("A1","A2","A3","A4","A5","A6","A7","A8") # Define the names of the states 
A1 <- c(0.9,0.1,0.1,0.1,0.1,0.1,0.1,0.1) # Set the probabilities of switching states, where the previous state was "A1" 
A2 <- c(0.1,0.9,0.1,0.1,0.1,0.1,0.1,0.1) # Set the probabilities of switching states, where the previous state was "A2" 
A3 <- c(0.1,0.1,0.9,0.1,0.1,0.1,0.1,0.1) # Set the probabilities of switching states, where the previous state was "A3" 
A4 <- c(0.1,0.1,0.1,0.9,0.1,0.1,0.1,0.1) # Set the probabilities of switching states, where the previous state was "A4" 
A5 <- c(0.1,0.1,0.1,0.1,0.9,0.1,0.1,0.1) # Set the probabilities of switching states, where the previous state was "A5" 
A6 <- c(0.1,0.1,0.1,0.1,0.1,0.9,0.1,0.1) # Set the probabilities of switching states, where the previous state was "A6" 
A7 <- c(0.1,0.1,0.1,0.1,0.1,0.1,0.9,0.1) # Set the probabilities of switching states, where the previous state was "A7" 
A8 <- c(0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.9) # Set the probabilities of switching states, where the previous state was "A8" 
thetransitionmatrix <- matrix(c(A1,A2,A3,A4,A5,A6,A7,A8), 8, 8, byrow = TRUE) # Create an 8 x 8 matrix 
rownames(thetransitionmatrix) <- states 
colnames(thetransitionmatrix) <- states 
thetransitionmatrix # Print out the transition matrix 
    A1 A2 A3 A4 A5 A6 A7 A8 
A1 0.9 0.1 0.1 0.1 0.1 0.1 0.1 0.1 
A2 0.1 0.9 0.1 0.1 0.1 0.1 0.1 0.1 
A3 0.1 0.1 0.9 0.1 0.1 0.1 0.1 0.1 
A4 0.1 0.1 0.1 0.9 0.1 0.1 0.1 0.1 
A5 0.1 0.1 0.1 0.1 0.9 0.1 0.1 0.1 
A6 0.1 0.1 0.1 0.1 0.1 0.9 0.1 0.1 
A7 0.1 0.1 0.1 0.1 0.1 0.1 0.9 0.1 
A8 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.9 

и эмиссии матрица представляет собой список матриц 8x4 п, с п, равным числу ОНП/строк в данных. Например, учитывая следующие данные для 8 образцов (A1-A8) через 3 ОНП/строк:

A1 A2 A3 A4 A5 A6 A7 A8 
T T T T T T T C 
T C T T T T T C 
A A A G G A A A 

матрицы 1 в списке будет

A C G T 
A1 0 0 0 1/7 
A2 0 0 0 1/7 
A3 0 0 0 1/7 
A4 0 0 0 1/7 
A5 0 0 0 1/7 
A6 0 0 0 1/7 
A7 0 0 0 1/7 
A8 0 1 0 0 

с 7 образцов обладают Т в строке 1 каждый образец имеет вероятность 1/7. Поскольку только A8 обладает C, существует 100% вероятность присвоения C-A8. Для строки 3 вывод должен быть

A C G T 
A1 1/6 0 0 0 
A2 1/6 0 0 0 
A3 1/6 0 0 0 
A4 1/2 0 0 0 
A5 1/2 0 0 0 
A6 1/6 0 0 0 
A7 1/6 0 0 0 
A8 1/6 0 0 0 

Используя вышеупомянутую матрицу перехода и список матриц выбросов, Я хотел бы implment алгоритма Витерби на любой последовательности аллели. Код, который я в настоящее время не в состоянии использовать другую матрицу выбросов для каждой строки

viterbi <- function(sequence, transitionmatrix, emissionmatrix) 
    # This carries out the Viterbi algorithm. 
    # Adapted from "Applied Statistics for Bioinformatics using R" by Wim P. Krijnen,  page 209 
    # (cran.r-project.org/doc/contrib/Krijnen-IntroBioInfStatistics.pdf) 
    { 
# Get the names of the states in the HMM: 
states <- rownames(theemissionmatrix) 

# Make the Viterbi matrix v: 
v <- makeViterbimat(sequence, transitionmatrix, emissionmatrix) 
# Go through each of the rows of the matrix v (where each row represents 
# a position in the DNA sequence), and find out which column has the 
# maximum value for that row (where each column represents one state of 
# the HMM): 
mostprobablestatepath <- apply(v, 1, function(x) which.max(x)) 

# Print out the most probable state path: 
prevnucleotide <- sequence[1] 
prevmostprobablestate <- mostprobablestatepath[1] 
prevmostprobablestatename <- states[prevmostprobablestate] 
startpos <- 1 
for (i in 2:length(sequence)) 
{ 
    nucleotide <- sequence[i] 
    mostprobablestate <- mostprobablestatepath[i] 
    mostprobablestatename <- states[mostprobablestate] 
    if (mostprobablestatename != prevmostprobablestatename) 
    { 
     print(paste("Positions",startpos,"-",(i-1), "Most probable state = ", prevmostprobablestatename)) 
     startpos <- i 
    } 
    prevnucleotide <- nucleotide 
    prevmostprobablestatename <- mostprobablestatename 
} 
print(paste("Positions",startpos,"-",i, "Most probable state = ", prevmostprobablestatename)) 
    } 


# the viterbi() function requires a second function makeViterbimat(): 

makeViterbimat <- function(sequence, transitionmatrix, emissionmatrix) 
    # This makes the matrix v using the Viterbi algorithm. 
    # Adapted from "Applied Statistics for Bioinformatics using R" by Wim P. Krijnen, page 209 
    # (cran.r-project.org/doc/contrib/Krijnen-IntroBioInfStatistics.pdf) 
    { 
# Change the sequence to uppercase 
sequence <- toupper(sequence) 
# Find out how many states are in the HMM 
numstates <- dim(transitionmatrix)[1] 
# Make a matrix with as many rows as positions in the sequence, and as many 
# columns as states in the HMM 
v <- matrix(NA, nrow = length(sequence), ncol = dim(transitionmatrix)[1]) 
# Set the values in the first row of matrix v (representing the first position of the sequence) to 0 
v[1, ] <- 0 
# Set the value in the first row of matrix v, first column to 1 
v[1,1] <- 1 
# Fill in the matrix v: 
for (i in 2:length(sequence)) # For each position in the DNA sequence: 
{ 
    for (l in 1:numstates) # For each of the states of in the HMM: 
    { 
     # Find the probabilility, if we are in state l, of choosing the nucleotide at position in the sequence 
     statelprobnucleotidei <- emissionmatrix[l,sequence[i]] 

     # v[(i-1),] gives the values of v for the (i-1)th row of v, ie. the (i-1)th position in the sequence. 
     # In v[(i-1),] there are values of v at the (i-1)th row of the sequence for each possible state k. 
     # v[(i-1),k] gives the value of v at the (i-1)th row of the sequence for a particular state k. 

     # transitionmatrix[l,] gives the values in the lth row of the transition matrix, xx should not be transitionmatrix[,l]? 
     # probabilities of changing from a previous state k to a current state l. 

     # max(v[(i-1),] * transitionmatrix[l,]) is the maximum probability for the nucleotide observed 
     # at the previous position in the sequence in state k, followed by a transition from previous 
     # state k to current state l at the current nucleotide position. 

     # Set the value in matrix v for row i (nucleotide position i), column l (state l) to be: 
     v[i,l] <- statelprobnucleotidei * max(v[(i-1),] * transitionmatrix[,l]) 
    } 
} 
return(v) 
    } 

ответ

1

, что останавливает вас от просто давая функцию список предварительно вычисленных матриц выбросов, а не какой-то один?

makeViterbimat <- function(sequence, transitionmatrix, emissionmatrixList) 
    # This makes the matrix v using the Viterbi algorithm. 
    # Adapted from "Applied Statistics for Bioinformatics using R" by Wim P. Krijnen, page 209 
    # (cran.r-project.org/doc/contrib/Krijnen-IntroBioInfStatistics.pdf) 
    { 
# Change the sequence to uppercase 
sequence <- toupper(sequence) 
# Find out how many states are in the HMM 
numstates <- dim(transitionmatrix)[1] 
# Make a matrix with as many rows as positions in the sequence, and as many 
# columns as states in the HMM 
v <- matrix(NA, nrow = length(sequence), ncol = dim(transitionmatrix)[1]) 
# Set the values in the first row of matrix v (representing the first position of the sequence) to 0 
v[1, ] <- 0 
# Set the value in the first row of matrix v, first column to 1 
v[1,1] <- 1 
# Fill in the matrix v: 
for (i in 2:length(sequence)) # For each position in the DNA sequence: 
{ 
    emissionmatrix = emissionmatrixList[[i]] 
    for (l in 1:numstates) # For each of the states of in the HMM: 
    { 
     # Find the probabilility, if we are in state l, of choosing the nucleotide at position in the sequence 
     statelprobnucleotidei <- emissionmatrix[l,sequence[i]] 

     # v[(i-1),] gives the values of v for the (i-1)th row of v, ie. the (i-1)th position in the sequence. 
     # In v[(i-1),] there are values of v at the (i-1)th row of the sequence for each possible state k. 
     # v[(i-1),k] gives the value of v at the (i-1)th row of the sequence for a particular state k. 

     # transitionmatrix[l,] gives the values in the lth row of the transition matrix, xx should not be transitionmatrix[,l]? 
     # probabilities of changing from a previous state k to a current state l. 

     # max(v[(i-1),] * transitionmatrix[l,]) is the maximum probability for the nucleotide observed 
     # at the previous position in the sequence in state k, followed by a transition from previous 
     # state k to current state l at the current nucleotide position. 

     # Set the value in matrix v for row i (nucleotide position i), column l (state l) to be: 
     v[i,l] <- statelprobnucleotidei * max(v[(i-1),] * transitionmatrix[,l]) 
    } 
} 
return(v) 
    } 

Или ваша проблема в том, как построить этот список выбросов?

+0

Благодарим вас за ответ, я выяснил решение, похожее на то, что вы предложили, но вместо этого с помощью statelprobnucleotidei <- emissionmatrix [[i]] [l, sequence [i]] ", который обеспечивает тот же вывод. Еще раз спасибо. – user2895292

+0

@ user2895292 Надеюсь, вы либо примете этот ответ, либо разместите свой собственный ответ и примите его. –