// Show differences between 2 texts using Word-style richtext markup.

/*
    This file provides the following function:

        string deltaText(string oldText, string newText, int oldFont, int newFont)

    which returns a richtext string describing the differences between two
    strings. New words in newText are underlined, deleted words in oldText
    are scored through. The bitmaps in oldFont and newFont indicate which
    fonts to use. The significance of the bits is defined by the constants
    BOLD, ITAL, UNDL and STRK.

    deltaText works by converting every distinct word and punctuation
    sequence into a token ("T" followed by a 4-digit number), and using
    regexps to match the strings of tokens. It searches for the longest
    possible matching sequence first, and recurses on the remaining prefix
    and suffix.
*/

/*
    Modification history:

    WHO  WHEN       WHAT
    jd   2 Oct 98   Creation.
    jd   5 Oct 98   Documentation update. Efficiency improvement by passing
                    on the size of the longest string matched so far.
    jd   9 Oct 98   Removed token limitation by using numbers instead.
*/

///////////////////////////////////////////////////////////
// Constants

const int TOKSIZE = 4   // number of digits used to number tokens
const int BOLD    = 1
const int ITAL    = 2
const int UNDL    = 4
const int STRK    = 8

///////////////////////////////////////////////////////////
// State variables

Skip wordSkip  = createString     // mapping of words to index
Array wordList = create(100, 1)   // mapping of index to words
Skip mapping   = create()         // mapping of old tokens to new tokens
int numWords   = 0                // number of entries in wordList

///////////////////////////////////////////////////////////
// Auxiliary functions

// convert index into token
string token(int i) {
    // pad the integer with leading zeros and keep the last TOKSIZE digits
    string tok = "000000000000" i ""
    return "T" tok[(length tok)-TOKSIZE:]
}

// convert string of words into string of tokens
string tokenizeText(string txt) {
    string sep = " \t\n(){}.,;:?!+-=\""
    Regexp hasWord = regexp "^([^" sep "]+)|([" sep "]+)"

    // scan text placing words into store
    string tokens = ""
    string restTxt = txt
    while ( hasWord restTxt && restTxt[match 0] != "" ) {
        string word = ""
        if ( restTxt[match 1] != "" )
            word = restTxt[match 1]
        else if ( restTxt[match 2] != "" )
            word = restTxt[match 2]
        if ( word == "" ) continue
        int idx
        if ( !find(wordSkip, word, idx) ) {
            // new word, so find a token for it, and store it
            put(wordList, word, numWords, 0)
            put(wordSkip, word, numWords)
            idx = numWords++
        }
        // add token to token string
        tokens = tokens token(idx)
        // remove word
        restTxt = restTxt[(end 0)+1:]
    }
    return tokens
}

// make RTF string to select font
string rtfFont(int fnt) {
    string result = ""
    if ( (fnt & BOLD) != 0 ) result = result "\\b"
    if ( (fnt & ITAL) != 0 ) result = result "\\i"
    if ( (fnt & UNDL) != 0 ) result = result "\\ul"
    if ( (fnt & STRK) != 0 ) result = result "\\strike"
    return result
}

// translate token back into the associated word
string makeWord(string c, int i) {
    int d = TOKSIZE+1
    int idx = intOf(c[i*d+1:(i+1)*d-1])
    return get(wordList, idx, 0)
}

// translate token back into the associated word with font marking
string makeMarkedWord(string c, int idx, int fnt) {
    string w = makeWord(c, idx)
    return "{" (rtfFont fnt) " " w "}"
}
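// Illustrative examples (hypothetical values; not part of the original file).
// Assuming a fresh word store and TOKSIZE == 4, the helpers above behave
// roughly as follows:
//
//     string toks = tokenizeText "Hello, world"   // toks == "T0000T0001T0002"
//                                                 // ("Hello", ", ", "world" -> indices 0, 1, 2)
//     token 7                                     // -> "T0007"
//     makeMarkedWord(toks, 0, UNDL)               // -> "{\ul Hello}"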
// convert token string back into a word string
string unTokenizeText(string oldToks, newToks, int oldFont, newFont) {
    int d = TOKSIZE+1
    string result = ""
    int lastj = -1
    int oldLen = (length oldToks)/d
    int newLen = (length newToks)/d
    if ( length oldToks > 0 ) {
        // scan old tokens
        int i
        for i in 0:oldLen-1 do {
            int j = lastj
            if ( find(mapping, i, j) ) {
                // found corresponding new token
                // are there new tokens to insert before this?
                if ( j > lastj+1 ) {
                    // yes - insert them
                    int k
                    for k in lastj+1:j-1 do {
                        result = result makeMarkedWord(newToks, k, newFont)
                    }
                }
                // insert unchanged token
                result = result makeWord(oldToks, i)
            } else {
                // insert deleted token
                result = result makeMarkedWord(oldToks, i, oldFont)
            }
            lastj = j
        }
    }
    // are there new tokens left to insert at the end?
    if ( lastj+1 < newLen ) {
        // yes - insert them
        int k
        for k in lastj+1:newLen-1 do {
            result = result makeMarkedWord(newToks, k, newFont)
        }
    }
    return result
}

// match two token strings
void matchStrings(string newText, oldText, int newDisp, oldDisp, matchLength) {
    // d = length of a token
    int d = TOKSIZE+1

    // find the length of the shorter string
    int lenNew = (length newText)/d
    int lenOld = (length oldText)/d
    int len = lenNew
    if ( len > lenOld ) len = lenOld
    if ( len > matchLength ) len = matchLength

    // do nothing if one string is empty
    if ( len == 0 ) return

    // start matching
    // we start with the longest possible subsequence, and gradually reduce the size of the match
    bool matched = false
    int p
    int j = len
    while ( !matched && j > 0 ) {
        // start with subsequence at beginning of new text
        int i = 0
        while ( !matched && i <= lenNew - j ) {
            matched = matches(newText[i*d:(i+j)*d-1], oldText)
            // p records the start point of the match (if there was one)
            p = i
            // increment the starting point of the subsequence
            i++
        }
        // reduce length of match
        j--
    }
    if ( matched ) {
        // record match in skip list
        int l = (start 0)/d
        int k
        for k in 0:j do {
            put(mapping, l+k+oldDisp, p+k+newDisp)
        }
        // recurse on remaining unmatched parts
        if ( p > 0 && l > 0 ) {
            matchStrings(newText[0:p*d-1], oldText[0:l*d-1], newDisp, oldDisp, j+1)
        }
        if ( p+j+1 < len && l+j+1 < len ) {
            matchStrings(newText[(p+j+1)*d:], oldText[(l+j+1)*d:], newDisp+p+j+1, oldDisp+l+j+1, j+1)
        }
    }
}

// match two token strings
void matchStrings(string newText, oldText, int newDisp, oldDisp) {
    // length of match is something big
    matchStrings(newText, oldText, newDisp, oldDisp, (length newText)+(length oldText))
}

///////////////////////////////////////////////////////////
// Main function

string deltaText(string oldText, newText, int oldFont, newFont) {
    string oldToks = tokenizeText oldText
    string newToks = tokenizeText newText
    matchStrings(newToks, oldToks, 0, 0)
    return unTokenizeText(oldToks, newToks, oldFont, newFont)
}
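///////////////////////////////////////////////////////////
// Usage sketch (hypothetical call; not part of the original file)
//
//     string rtf = deltaText("the quick brown fox",
//                            "the quick red fox",
//                            STRK,   // deleted words: struck through
//                            UNDL)   // inserted words: underlined
//
// rtf then contains Word-style richtext in which "brown" is struck through
// and "red" is underlined, while unchanged words are left plain.
// Note that wordSkip, wordList, mapping and numWords are module-level state
// shared between calls, so the mapping should be cleared before comparing
// another pair of texts.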