// Show differences between 2 texts using Word-style richtext markup.

/*
    This file provides the following function:

        string deltaText(string oldText, string newText, int oldFont, int newFont)

    which returns a richtext string describing the differences between two
    strings. New words in newText are underlined, deleted words in oldText
    are scored through. The bitmaps in oldFont and newFont indicate which
    fonts to use. The significance of the bits is defined by the constants
    BOLD, ITAL, UNDL and STRK.

    deltaText works by converting every distinct word and punctuation
    sequence into a token ("T" followed by a 4-digit number), and using
    regexps to match the strings of tokens. It searches for the longest
    possible matching sequence first, and recurses on the remaining prefix
    and suffix.
*/

/*
    Modification history:

    WHO  WHEN       WHAT
    jd   2 Oct 98   Creation.
    jd   5 Oct 98   Documentation update. Efficiency improvement by passing
                    on the size of the longest string matched so far.
    jd   9 Oct 98   Removed token limitation by using numbers instead.
*/

///////////////////////////////////////////////////////////
// Constants

const int TOKSIZE = 4   // number of digits used to number tokens
const int BOLD    = 1
const int ITAL    = 2
const int UNDL    = 4
const int STRK    = 8

///////////////////////////////////////////////////////////
// State variables

Skip wordSkip  = createString     // mapping of words to index
Array wordList = create(100, 1)   // mapping of index to words
Skip mapping   = create()         // mapping of old tokens to new tokens
int numWords   = 0                // number of entries in wordList

///////////////////////////////////////////////////////////
// Auxiliary functions

// convert index into token
string token(int i) {
    // pad the integer with leading zeros and keep the last TOKSIZE digits
    string tok = "000000000000" i ""
    return "T" tok[(length tok)-TOKSIZE:]
}

// convert string of words into string of tokens
string tokenizeText(string txt) {
    string sep = " \t\n(){}.,;:?!+-=\""
    Regexp hasWord = regexp "^([^" sep "]+)|([" sep "]+)"

    // scan text placing words into store
    string tokens = ""
    string restTxt = txt
    while ( hasWord restTxt && restTxt[match 0] != "" ) {
        string word = ""
        if ( restTxt[match 1] != "" )
            word = restTxt[match 1]
        else if ( restTxt[match 2] != "" )
            word = restTxt[match 2]
        if ( word == "" ) continue
        int idx
        if ( !find(wordSkip, word, idx) ) {
            // new word, so find a token for it, and store it
            put(wordList, word, numWords, 0)
            put(wordSkip, word, numWords)
            idx = numWords++
        }
        // add token to token string
        tokens = tokens token(idx)
        // remove word
        restTxt = restTxt[(end 0)+1:]
    }
    return tokens
}

// make RTF string to select font
string rtfFont(int fnt) {
    string result = ""
    if ( (fnt & BOLD) != 0 ) result = result "\\b"
    if ( (fnt & ITAL) != 0 ) result = result "\\i"
    if ( (fnt & UNDL) != 0 ) result = result "\\ul"
    if ( (fnt & STRK) != 0 ) result = result "\\strike"
    return result
}

// translate token back into the associated word
string makeWord(string c, int i) {
    int d = TOKSIZE+1
    int idx = intOf(c[i*d+1:(i+1)*d-1])
    return get(wordList, idx, 0)
}

// translate token back into the associated word with font marking
string makeMarkedWord(string c, int idx, int fnt) {
    string w = makeWord(c, idx)
    return "{" (rtfFont fnt) " " w "}"
}
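// Illustrative examples (hypothetical values; not part of the original file).
// Assuming a fresh word store and TOKSIZE == 4, the helpers above behave
// roughly as follows:
//
//     string toks = tokenizeText "Hello, world"   // toks == "T0000T0001T0002"
//                                                 // ("Hello", ", ", "world" -> indices 0, 1, 2)
//     token 7                                     // -> "T0007"
//     makeMarkedWord(toks, 0, UNDL)               // -> "{\ul Hello}"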
// convert token string back into a word string
string unTokenizeText(string oldToks, newToks, int oldFont, newFont) {
    int d = TOKSIZE+1
    string result = ""
    int lastj = -1
    int oldLen = (length oldToks)/d
    int newLen = (length newToks)/d
    if ( length oldToks > 0 ) {
        // scan old tokens
        int i
        for i in 0:oldLen-1 do {
            int j = lastj
            if ( find(mapping, i, j) ) {
                // found corresponding new token
                // are there new tokens to insert before this?
                if ( j > lastj+1 ) {
                    // yes - insert them
                    int k
                    for k in lastj+1:j-1 do {
                        result = result makeMarkedWord(newToks, k, newFont)
                    }
                }
                // insert unchanged token
                result = result makeWord(oldToks, i)
            } else {
                // insert deleted token
                result = result makeMarkedWord(oldToks, i, oldFont)
            }
            lastj = j
        }
    }
    // are there new tokens left to insert at the end?
    if ( lastj+1 < newLen ) {
        // yes - insert them
        int k
        for k in lastj+1:newLen-1 do {
            result = result makeMarkedWord(newToks, k, newFont)
        }
    }
    return result
}

// match two token strings
void matchStrings(string newText, oldText, int newDisp, oldDisp, matchLength) {
    // d = length of a token
    int d = TOKSIZE+1

    // find the length of the shorter string
    int lenNew = (length newText)/d
    int lenOld = (length oldText)/d
    int len = lenNew
    if ( len > lenOld ) len = lenOld
    if ( len > matchLength ) len = matchLength

    // do nothing if one string is empty
    if ( len == 0 ) return

    // start matching
    // we start with the longest possible subsequence, and gradually reduce the size of the match
    bool matched = false
    int p
    int j = len
    while ( !matched && j > 0 ) {
        // start with subsequence at beginning of new text
        int i = 0
        while ( !matched && i <= lenNew - j ) {
            matched = matches(newText[i*d:(i+j)*d-1], oldText)
            // p records the start point of the match (if there was one)
            p = i
            // increment the starting point of the subsequence
            i++
        }
        // reduce length of match
        j--
    }
    if ( matched ) {
        // record match in skip list
        int l = (start 0)/d
        int k
        for k in 0:j do {
            put(mapping, l+k+oldDisp, p+k+newDisp)
        }
        // recurse on remaining unmatched parts
        if ( p > 0 && l > 0 ) {
            matchStrings(newText[0:p*d-1], oldText[0:l*d-1], newDisp, oldDisp, j+1)
        }
        if ( p+j+1 < len && l+j+1 < len ) {
            matchStrings(newText[(p+j+1)*d:], oldText[(l+j+1)*d:], newDisp+p+j+1, oldDisp+l+j+1, j+1)
        }
    }
}

// match two token strings
void matchStrings(string newText, oldText, int newDisp, oldDisp) {
    // length of match is something big
    matchStrings(newText, oldText, newDisp, oldDisp, (length newText)+(length oldText))
}

///////////////////////////////////////////////////////////
// Main function

string deltaText(string oldText, newText, int oldFont, newFont) {
    string oldToks = tokenizeText oldText
    string newToks = tokenizeText newText
    matchStrings(newToks, oldToks, 0, 0)
    return unTokenizeText(oldToks, newToks, oldFont, newFont)
}
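///////////////////////////////////////////////////////////
// Usage sketch (hypothetical call; not part of the original file)
//
//     string rtf = deltaText("the quick brown fox",
//                            "the quick red fox",
//                            STRK,   // deleted words: struck through
//                            UNDL)   // inserted words: underlined
//
// rtf then contains Word-style richtext in which "brown" is struck through
// and "red" is underlined, while unchanged words are left plain.
// Note that wordSkip, wordList, mapping and numWords are module-level state
// shared between calls, so the mapping should be cleared before comparing
// another pair of texts.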