# Created by Matt Asher for statisticsblog.com
# Feel free to share and modify so long as this header remains

# Generates fake text with a first-order Markov chain over characters: the
# source text is used to count how often each character follows each other
# character, and new text is produced by repeatedly sampling the next
# character given the current one.

# Text file to use to generate transition probabilities
sourceText <- "/path/to/source.txt"

# Should we write the result back to a text file?
saveOutput <- TRUE

# Output text file
resultTxt <- "/path/to/output.txt"

# Number of characters for the fake text
newTxtLength <- 10^5
stopifnot(newTxtLength >= 1)

# Turn source file into a vector of individual characters
txt <- readChar(sourceText, file.info(sourceText)$size)
txt <- strsplit(txt, "", fixed = TRUE)[[1]]

# At least one (current, upcoming) pair is needed to estimate transitions
if (length(txt) < 2) {
  stop("Source text must contain at least two characters", call. = FALSE)
}
lenM1 <- length(txt) - 1

# Get all the unique characters which appear in the source text
uniqueChars <- unique(txt)
nChars <- length(uniqueChars)

# Position of every source character within uniqueChars, computed once so
# that neither the counting nor the generation has to rescan uniqueChars
charIndex <- match(txt, uniqueChars)

# Create main transition matrix of counts: rows are the current character,
# columns the upcoming one. Each (row, col) pair is mapped to its
# column-major linear index and all pairs are counted in a single pass.
fromIdx <- charIndex[seq_len(lenM1)]
toIdx <- charIndex[seq_len(lenM1) + 1]
tMat <- matrix(
  as.numeric(tabulate((toIdx - 1) * nChars + fromIdx, nbins = nChars^2)),
  nrow = nChars,
  ncol = nChars
)

# Convert counts to frequencies (note, this will generally be a sparse matrix)
# The whole matrix is scaled by one constant; sample() renormalises each row,
# so rows do not need to sum to 1 individually.
tMat <- tMat / sum(tMat)

# Overall character frequencies, used whenever the chain has no transition
# information to go on (see below)
charFreq <- tabulate(charIndex, nbins = nChars)

# Generated text, held as indices into uniqueChars until the very end
newIdx <- integer(newTxtLength)

# Seed character to begin fake text. A newline is used when the source
# contains one; otherwise a character is picked at random.
seedIdx <- match("\n", uniqueChars)
if (is.na(seedIdx)) {
  seedIdx <- sample.int(nChars, 1, prob = charFreq)
}
newIdx[1] <- seedIdx

# seq_len()[-1] is empty when newTxtLength is 1, unlike 2:newTxtLength
for (j in seq_len(newTxtLength)[-1]) {
  # Look at the corresponding row of the matrix
  tFreq <- tMat[newIdx[j - 1], ]

  # A character that is never followed by anything (e.g. the final character
  # of the source, if it occurs nowhere else) has an all-zero row, which
  # sample.int() rejects; restart from the overall character frequencies.
  if (!any(tFreq > 0)) {
    tFreq <- charFreq
  }

  # Pick a new character based on transition probabilities
  newIdx[j] <- sample.int(nChars, 1, prob = tFreq)
}

# Collapse it all into a single string
newTxt <- paste(uniqueChars[newIdx], collapse = "")

if (saveOutput) {
  # Save this new text file; given a path, writeLines() opens and closes
  # the file itself
  writeLines(newTxt, con = resultTxt)
}