Jump to content

User:PerfektesChaos/WikidiffLX/coding/WikidiffLX.cpp

From mediawiki.org

Based on wikidiff2/wikidiff2.cpp (rev 83886 Mar 2011).

Declarations in WikidiffLX.h


  • Renamed following the new generation name
  • Added many methods


[edit]
/**
 * Diff formatter, based on code by Steinar H. Gunderson,
 * converted to work with the Dairiki diff engine by Tim Starling (Wikidiff2)
 *
 * Extended for WikidiffLX by PerfektesChaos@de.wikipedia 2011
 * GPL.
 */

#include "WikidiffLX.h"
#include <string>
#include <thai/thailib.h>
#include <thai/thwchar.h>
#include <thai/thbrk.h>

#include <stdio.h>



#define SPAN_INLINE "<span class=\"diffchange diffchange-inline\">"
#define TD_LEFT "<tr>\n  <td colspan=\"2\">&nbsp;"
#define TD_RIGHT "</td>\n" \
                 "  <td class=\"diff-marker\">+</td>\n" \
                 "  <td class=\"diff-addedline\">"
#define TR_LEFT "<tr>\n  <td class=\"diff-marker\">&#x2212;</td>\n" \
                "   <td class=\"diff-deletedline\">"


public

[edit]

execute()

[edit]

Based on wikidiff2 with following changes:

  • merged with former diffLines()
  • delegating HTML output to printLines()

Major task controlling line separation, diff engine, postprocessing engine result, formatting HTML.

const WikidiffLX::String & WikidiffLX::execute(const String & text1,
                                               const String & text2,
                                               int numContextLines)
{
   numberContextL = numContextLines;
   // Allocate some result space to avoid excessive copying
   result.clear();
   result.reserve(text1.size() + text2.size() + MAX_DIFF_LINE);

   // Split input strings into lines
   LineVector lines1;
   LineVector lines2;
   explodeLines(text1, lines1);
   explodeLines(text2, lines2);

   // Do the diff
   LineDiff linediff(lines1, lines2);

   // Transfer engine result into lines
   diffResult(linediff);

   // Turn whitespace only differences into changes
   diffWhitespace(lines1);

   // Adjust virtual line segmentation
   recoverParagraphs(lines1, lines2);

   // Format line differences as HTML
   printLines(lines1, lines2);

   // Return a reference to the result buffer
   return result;
}   // execute()


private

[edit]

diffResult()

[edit]

New methodology for postprocessing of virtual lines. Result of engine is transferred from LineDiff into Line objects. Note that the two LineVector collections are accessed by their pointers in DiffOp.

The block structure of combined op codes in LineDiff is part of the engine algorithm, located deep inside. That should not be changed. It would affect Word presentation as well as possible exchange of the engine. The Diff object does not permit array manipulation. The least confusing way is to transfer the engine result into the Line objects.

void WikidiffLX::diffResult(LineDiff diffs)
{
   // Transfer engine result into Line objects
   const size_t nDiff = diffs.size();
   size_t nBoth;
   size_t nFrom;
   size_t nTo;
   size_t j;
   DiffOp<Line> *diff;
   LinePVector fv;
   LinePVector tv;
   Line *f;
   Line *t;
   for (size_t i = 0; i < nDiff; i++) {
      diff = &diffs[i];
      switch (diff->op) {
         case DiffOp<Line>::copy :
         case DiffOp<Line>::change :
            fv = diff->from;
            tv = diff->to;
            nFrom = fv.size();
            nTo = tv.size();
            nBoth = std::min(nFrom, nTo);
            for (j = 0; j < nBoth; j++) {
               f = (Line *)fv[j];
               t = (Line *)tv[j];
               f->set_diff(diff->op, t);
               t->set_diff(diff->op, f);
            }   // for j
            for (j = nBoth; j < nFrom; j++) {
               f = (Line *)fv[j];
               f->set_diff(DiffOp<Line>::del);
            }   // for j
            for (j = nBoth; j < nTo; j++) {
               t = (Line *)tv[j];
               t->set_diff(DiffOp<Line>::add);
            }   // for j
            break;
         case DiffOp<Line>::del :
            fv = diff->from;
            nFrom = fv.size();
            for (j = 0; j < nFrom; j++) {
               f = (Line *)fv[j];
               f->set_diff(DiffOp<Line>::del);
            }   // for j
            break;
         case DiffOp<Line>::add :
            tv = diff->to;
            nTo = tv.size();
            for (j = 0; j < nTo; j++) {
               t = (Line *)tv[j];
               t->set_diff(DiffOp<Line>::add);
            }   // for j
            break;
      }   // switch .op
   }   // for i
}   // diffResult()

diffWhitespace()

[edit]

New methodology for trailing whitespace at end of line and invisible trailing lines. Major part of work is done within Line objects.

Note that this is called once for from only but does affect the to counterparts also.

void WikidiffLX::diffWhitespace(LineVector from)
{
   // detect whitespace only differences and translate into changes
   const size_t n = from.size();
   for (size_t i = 0; i < n; i++) {
      from[i].whitespaceOnly();
   }   // for i
}   // diffWhitespace()

explodeLines()

[edit]

Based on explodeLines() in wikidiff2 but recognizing breaks of virtual lines. Detect invisible trailing lines, separate from lines with visible content and hide them in Line objects.

Usage of allocated String for line text replaced by iterators.

void WikidiffLX::explodeLines(const String & text,
                              LineVector & lines)
{
   const Iterator pE = text.end();
   Iterator pB = text.begin();
   Iterator p;
   Iterator pH;
   Iterator pV;
   Iterator pS;
   Iterator pT;
   unsigned char b;
#ifndef NO_LINE_NUMBERS
   size_t k = 1;
   size_t keep = 1;
#endif
   while (pB != pE) {
      pH = std::find(pB, pE, '\n');
      while (pH - pB > LINELENGTH_VIRTUAL) {
         pT = pH - LINELENGTH_MIN;
         findVirtualBreak(pB + LINELENGTH_MIN, pT, &pV);
         if (pV == pT) {   // not found
            break;   // while
         } else {
            pS = pV;
            while (pS != pE) {
               b = (unsigned char)*pS;
               if (b == 0x20) {   // or other invisibles
                  pS++;
               } else {
                  break;   // while
               }
            }   // while
            if (pH - pS > LINELENGTH_MIN) {
#ifdef NO_LINE_NUMBERS
               lines.push_back(Line(pB, pV, pS));
#else
               lines.push_back(Line(pB, pV, pS, keep));
#endif
               pB = pS;
            } else {
               break;   // while
            }
         }
      }   // while  pH - pB > LINELENGTH_VIRTUAL
      pS = pH;
      p = pH;
      while (p > pB) {
         p--;
         b = (unsigned char)*p;
         if (b == 0x20) {   // or other invisibles
            pS = p;
         } else {
            break;   // while
         }
      }   // while suffix
      if (pH == pE) {
         pT = pE;
      } else {
         pT = pH;
         p = pT;
         while (true) {
            if (p == pE) {
               if (pH == pB) {
                  pH = pE;
               }
               pT = pE;
               break;   // while
            } else {
               b = (unsigned char)*p;
               if (b > 0x20) {   // may be other invisibles (Unicode)
                  break;   // while
               } else if (b == 0x0A) {
#ifndef NO_LINE_NUMBERS
                  if (pT != pH) {
                     k++;
                  }
#endif
                  pT = p + 1;
               }
            }
            p++;
         }   // while trailing
      }
#ifdef NO_LINE_NUMBERS
      lines.push_back(Line(pB, pS, pH, pT));
#else
      lines.push_back(Line(pB, pS, pH, pT, keep));
      k++;
      keep = k;
#endif
      pB = pT;
      if (pT != pE) {
         pT++;
      }
   }   // while  pB != pE
}   // explodeLines()

explodeWords()

[edit]

Based on explodeWords() in wikidiff2 but completely rewritten and accelerated. Handling of Unicode spaces introduced. Thai processing started only if really Thai character. Every CJK symbol is treated as single word now.


Not yet included:


void WikidiffLX::explodeWords(const String & text,
                              WordVector &words)
{
   // Split a string into words
   const Iterator pE = text.end();
   Iterator p = text.begin();
   // Don't try to do a word-level diff on very long lines
   if (text.size() > MAX_DIFF_LINE) {
      words.push_back(Word(p, pE, pE));
      return;
   }
   Iterator chStart;
   Iterator suffixStart;
   Iterator wordStart;
   unsigned int ch;
   unsigned char b;
   bool livingSuffix = false;
   bool livingWord = false;
   bool locateSuffix;
   while (p < pE) {
      chStart = p;
      locateSuffix = false;
      b = (unsigned char)*p;
      if (b < 0xC0) {   // ASCII or undefined
         if (b < 0x21) {
            locateSuffix = (b == 0x20 || 0x09);   // some more ...
         }
      } else {   // UTF range
         ch = nextUtf8Char(p, b, pE);
         if (ch > 0x0E00) {   // could be interesting
            if (ch > 0x0E00 && ch <= 0x0E5F) {   // Thai
               explodeWordSuffix(livingWord, wordStart, livingSuffix, suffixStart, chStart, words);
               wordStart = chStart;
               explodeWordsThai(ch, p, pE, words, wordStart);
            } else if (ch >= 0x2002 && ch <= 0x200A) {   // spaces
               locateSuffix = true;
            } else if (ch >= 0x1100 && ch <= 0x11FF ||
                       ch >= 0x2E80 && ch <= 0x9FFF ||
                       ch >= 0xAC00 && ch <= 0xD7AF ||
                       ch >= 0x20000 && ch <= 0x2FA1F) {   // CJK
               explodeWordSuffix(livingWord, wordStart, livingSuffix, suffixStart, chStart, words);
               wordStart = chStart;
               words.push_back(Word(chStart, p, p));
            }
         }
      }
      if (locateSuffix) {
         if (! livingSuffix) {
            suffixStart = chStart;
            livingSuffix = true;
         }
      } else if (livingSuffix) {
         explodeWordSuffix(livingWord, wordStart, livingSuffix, suffixStart, p, words);
      } else if (! livingWord) {
         wordStart = chStart;
         livingWord = true;
      }
      ++p;
   }   // while (p < pE)
   explodeWordSuffix(livingWord, wordStart, livingSuffix, suffixStart, pE, words);
}   // explodeWords()
void WikidiffLX::explodeWordSuffix(bool & livingWord,
                                   const Iterator wordStart,
                                   bool & livingSuffix,
                                   const Iterator suffixStart,
                                   const Iterator wordNext,
                                   WordVector & words)
{
   // Add word body and suffix interval to words, if any; reset living
   if (livingWord) {   // regular word pending
      if (wordStart < wordNext) {
         words.push_back(Word(wordStart, (livingSuffix?suffixStart:wordNext), wordNext));
      }
   } else if (livingSuffix) {   // append to previous CJK or Thai word
      if (! words.empty()) {   // not the beginning of the line
         words.pop_back();
         words.push_back(Word(wordStart, suffixStart, wordNext));
      }
   }
   livingWord = false;
   livingSuffix = false;
}   // explodeWordSuffix()

explodeWordsThai()

[edit]

Re-written procedure for handling of substring containing thai characters only in Thai line splitting into words within explodeWords().

void WikidiffLX::explodeWordsThai(unsigned int ch,
                                  Iterator & p,
                                  const Iterator pE,
                                  WordVector & words,
                                  Iterator & wordStart)
{
   // Pointing on thai character sequence and add particular thai words.
   // Thai chars are in U+0E00...0E5F each represented by three bytes UTF8.
   IntVector thaiBreakPositions;
   String tisText;
   Iterator pT = p;
   Iterator p0 = wordStart;
   int nBreaks;
   size_t max = (pE - p) / 3 + 1;
   unsigned char b;
   tisText.reserve(max);
   tisText = th_uni2tis(ch);
   while (pT < pE) {
      b = (unsigned char)*pT;
      if (b == 0xC0) {
         ch = nextUtf8Char(pT, b, pE);
         if (ch > 0x0E00 && ch <= 0x0E5F) {
            tisText += th_uni2tis(ch);
            p = pT;
            pT++;
         } else {   // not in Thai range
            break;   // while
         }
      } else {   // out of relevant UTF8
         break;   // while
      }
   }   // while (pT < pE)
   tisText += '\0';
   max = tisText.size();
   thaiBreakPositions.resize(max);
   nBreaks = th_brk((const thchar_t*)(tisText.data()), &thaiBreakPositions[0], max);
   for (int i = 0; i < nBreaks; i++) {
      pT = p0 + 3 * thaiBreakPositions[i];
      words.push_back(Word(wordStart, pT, pT));
      wordStart = pT;
   }   // for i
}   // explodeWordsThai()

findVirtualBreak()

[edit]

New method for detection of breaks of virtual lines within explodeLines().

void WikidiffLX::findPunctSpace(const Iterator pB,
                                const Iterator pE,
                                const unsigned char punct,
                                Iterator * p)
{
   // Detect punct+space; return point after punct or pE
   Iterator pR = pE;
   unsigned char b;
   *p = std::find(pB, pE, punct);
   while (*p != pE) {
      *p++;
      if (*p != pE) {
         b = (unsigned char)**p;
         if (b == 0x20) {
            break;   // while
         } else {
            *p = std::find(*p, pE, punct);
         }
      }
   }
}   // findPunctSpace()
void WikidiffLX::findVirtualBreak(const Iterator pB,
                                  const Iterator pE,
                                  Iterator * p)
{
   // Detect virtual line break; return point after terminator
   *p = pE;
   findPunctSpace(pB, *p, '.', p);
   findPunctSpace(pB, *p, '?', p);
   findPunctSpace(pB, *p, '!', p);
   /*
     Other candidates, requiring UTF8 decoding:
     * U+3002 CJK period
    */
}   // findVirtualBreak()

nextUtf8Char()

[edit]

Based on wikidiff2 but demanding 32bit characters explicitly.

unsigned int WikidiffLX::nextUtf8Char(Iterator & p,
                                      unsigned char b,
                                      const Iterator pE)
{
   // Weak UTF-8 decoder
   // Will return garbage on invalid input (overshort sequences, overlong sequences, etc.)
   // but Mediawiki never provides bad UTF-8 texts ...
   short seqLength = 0;
   unsigned int c = 0;
   while (p < pE) {
      if (b < 0x80) {
         c = b;
         seqLength = 0;
      } else if (b >= 0xC0) {
         // Start of UTF-8 character
         // If this is unexpected, due to an overshort sequence,
         // we ignore the invalid sequence and resynchronise here
         if (b < 0xE0) {
            seqLength = 1;
            c = b & 0x1F;
         } else if (b < 0xF0) {
            seqLength = 2;
            c = b & 0x0F;
         } else {
            seqLength = 3;
            c = b & 7;
         }
      } else if (seqLength) {
         c <<= 6;
         c |= b & 0x3F;
         --seqLength;
      } else {
         // Unexpected continuation, ignore
      }
      ++p;
      if (! seqLength) {
         break;   // while
      }
   }   // while (p < pE)
   return c;
}   // nextUtf8Char()

printLines()

[edit]

Mainly new code for the presentation of paragraphs. The respective diffLines() procedure in wikidiff2 is replaced by printLines(). Handling of virtual line sequences is implemented by filling table cells until hard break encountered. Trailing invisible lines are discovered and presented as table cells where necessary.

void WikidiffLX::printLines(const LineVector from,
                            const LineVector to)
{
   // Print all diff into result as HTML
   // linediff is adjusted with hard breaks at .op borders
   // .op=change lines are merged as single paragraphs
   const size_t nFrom = from.size();
   const size_t nTo = to.size();
   size_t iFrom = 0;
   size_t iTo = 0;
   bool leap = from[0].is_Copy() && to[0].is_Copy();
   while (iFrom < nFrom && iTo < nTo) {
      if (leap) {
         printLinesContext(from, to, nFrom, nTo, &iFrom, &iTo);
      } else {
         printLinesDiff(from, to, nFrom, nTo, &iFrom, &iTo);
      }
      leap = ! leap;
   }   // while
}   // printLines()
void WikidiffLX::printLinesContext(const LineVector from,
                                   const LineVector to,
                                   const size_t nFrom,
                                   const size_t nTo,
                                   size_t * iFrom,
                                   size_t * iTo)
{
   // Print unchanged lines as context before / after diff
   // *iFrom,*iTo changing from start of copy block to end of copy block
   const size_t jFrom = *iFrom;   // end of previous differing
   const size_t jTo = *iTo;   // end with following differing
   size_t k;
   bool latest = true;   // last block at all
   for ( ; *iFrom < nFrom; *iFrom++) {
      if (! from[*iFrom].is_Copy()) {
         latest = false;
         break;   // for *iFrom
      }
   }   // for *iFrom
   for ( ; *iTo < nTo; *iTo++) {
      if (! to[*iTo].is_Copy()) {
         latest = false;
         break;   // for *iTo
      }
   }   // for *iTo
   if (jFrom || jTo) {   // pending trailer required
      k = jFrom + numberContextL;
      if (latest) {
         printLinesContextRows(from, *iFrom, std::min(k, nFrom));
      } else {   // differing block follows
         if (*iFrom - jFrom > 2 * numberContextL + 1) {   // separate
            printLinesContextRows(from, jFrom, jFrom + numberContextL);
            k = *iFrom - numberContextL;
#ifndef NO_LINE_NUMBERS
            printLineNumber(from[k], to[*iTo - numberContextL]);
#endif
            printLinesContextRows(from, k, *iFrom);
         } else {   // one block
            printLinesContextRows(from, jFrom, *iFrom);
         }
      }
   } else if (! latest) {   // first lines not differing, but later
      k = (*iFrom > numberContextL ? *iFrom - numberContextL : 0);
#ifndef NO_LINE_NUMBERS
      printLineNumber(from[k], to[k]);
#endif
      printLinesContextRows(from, k, *iFrom);
   }
}   // printLinesContext()
void WikidiffLX::printLinesContextRows(const LineVector context,
                                       const size_t iBeg,
                                       const size_t iEnd)
{
   size_t i;
   size_t k = iBeg;
   for (i = iBeg; i < iEnd; i++) {
      if (context[i].is_HardBreak()) {
         printLinesContextRow(context, k, i);
         k = i;
      }
   }   // for i
   if (k < iEnd) {
      printLinesContextRow(context, k, iEnd);
   }
}   // printLinesContextRows()
void WikidiffLX::printLinesContextRow(const LineVector context,
                                      const size_t iBeg,
                                      const size_t iEnd)
{
   // Print paragraph for context  <tr ...</tr>
   size_t i;
   result += "<tr>\n"
             "  <td class=\"diff-marker\"> </td>\n"
             "  <td class=\"diff-context\"><div>";
   for (i = iBeg; i < iEnd; i++) {
      if (i != iBeg) {
         result += " ";   // drop invisible whitespace, pay for a space
      }
      printText(context[i].get_body());
   }   // for i
   result += "</div></td>\n"
             "  <td class=\"diff-marker\"> </td>\n"
             "  <td class=\"diff-context\"><div>";
   for (i = iBeg; i < iEnd; i++) {
      if (i != iBeg) {
         result += " ";   // drop invisible whitespace, pay for a space
      }
      printText(context[i].get_body());
   }   // for i
   result += "</div></td>\n</tr>\n";
}   // printLinesContextRow()
#ifndef NO_LINE_NUMBERS
void WikidiffLX::printLineNumber(const Line f, const Line t)
{
   snprintf(scratch256, 256,
            "<tr>\n"
            "  <td colspan=\"2\" class=\"diff-lineno\"><!--LINE %u--></td>\n"
            "  <td colspan=\"2\" class=\"diff-lineno\"><!--LINE %u--></td>\n"
            "</tr>\n",
            f.get_lineNumber(), t.get_lineNumber());
   result += scratch256;
}   // printLineNumber()
#endif
void WikidiffLX::printLinesDiff(const LineVector from,
                                const LineVector to,
                                const size_t nFrom,
                                const size_t nTo,
                                size_t * iFrom,
                                size_t * iTo)
{
   // Print sequence of differing lines
   int mL;
   int mR;
   while (*iFrom < nFrom || *iTo < nTo) {
      mL = DiffOp<Line>::copy;
      mR = DiffOp<Line>::copy;
      if (*iFrom < nFrom) {
         mL = from[*iFrom].get_diffCode();
         if (mL == DiffOp<Line>::del) {
            mL = printLinesDelete(from, nFrom, iFrom);
         }
      }
      if (*iTo < nTo) {
         mR = to[*iTo].get_diffCode();
         if (mR == DiffOp<Line>::del) {
            mR = printLinesAdd(to, nTo, iTo);
         }
      }
      if (mL == DiffOp<Line>::copy && mR == DiffOp<Line>::copy) {
         break;   // while
      } else {
         printLinesChange(from, to, nFrom, nTo, iFrom, iTo);
      }
   }   // while  <nFrom && <nTo
}   // printLinesDiff()
int WikidiffLX::printLineSingleCell(const LineVector vector,
                                    const size_t n,
                                    size_t * i)
{
   // Print table cell for 'del' or 'add'
   // Return number of trailing lines
   const Line *e;
   size_t more = 0;
   while (*i < n) {
      e = &vector[*i];
      (*i)++;
      printText(e->get_body());
      if (e->is_HardBreak()) {
         more = e->get_trailingCount();
         break;   // while
      } else {
         result += " ";
      }
   }   // while  true
   result += "</div></td>\n</tr>\n";
   return more;
}   // printLineSingleCell()
int WikidiffLX::printLinesDelete(const LineVector from,
                                 const size_t nFrom,
                                 size_t * iFrom)
{
   // Print blocks of deleted lines starting at from[*iFrom]
   // Return next opcode
   size_t more;
   int next = DiffOp<Line>::copy;
   while (*iFrom < nFrom) {
      next = from[*iFrom].get_diffCode();
      if (next == DiffOp<Line>::copy) {
         break;   // while
      } else {
         result += TR_LEFT "<div>";
         more = printLineSingleCell(from, nFrom, iFrom);
         while (more) {
            result += TR_LEFT "&nbsp;</td>\n</tr>\n";
            more--;
         }   // while
      }
   }   // while  <nFrom
   return next;
}   // printLinesDelete()
int WikidiffLX::printLinesAdd(const LineVector to,
                              const size_t nTo,
                              size_t * iTo)
{
   // Print blocks of added lines starting at to[*iTo]
   // Return next opcode
   size_t more;
   int next = DiffOp<Line>::copy;
   while (*iTo < nTo) {
      next = to[*iTo].get_diffCode();
      if (next == DiffOp<Line>::copy) {
         break;   // while
      } else {
         result += TD_LEFT TD_RIGHT "<div>";
         more = printLineSingleCell(to, nTo, iTo);
         while (more) {
            result += TD_LEFT TD_RIGHT "&nbsp;</td>\n</tr>\n";
            more--;
         }   // while
      }
   }   // while  <nTo
   return next;
}   // printLinesAdd()
void WikidiffLX::printLinesChange(const LineVector from,
                                  const LineVector to,
                                  const size_t nFrom,
                                  const size_t nTo,
                                  size_t * iFrom,
                                  size_t * iTo)
{
   // Print pairs of blocks of 'change' starting at from[*iFrom]/to[*iTo]
   int mL;
   int mR;
   while (*iFrom < nFrom && *iTo < nTo) {
      mL = from[*iFrom].get_diffCode();
      mR = to[*iTo].get_diffCode();
      if (mL == DiffOp<Line>::change && mR == DiffOp<Line>::change) {
         printLineChangeRow(from, to, nFrom, nTo, iFrom, iTo);
      } else if (mL == DiffOp<Line>::change) {
         printLinesDelete(from, nFrom, iFrom);
      } else if (mR == DiffOp<Line>::change) {
         printLinesAdd(to, nTo, iTo);
      } else {
         break;   // while
      }
   }   // while  <nFrom && <nTo
   while (*iFrom < nFrom) {
      if (from[*iFrom].get_diffCode() == DiffOp<Line>::change) {
         printLinesDelete(from, nFrom, iFrom);
      } else {
         break;   // while
      }
   }   // while  <nFrom
   while (*iTo < nTo) {
      if (to[*iTo].get_diffCode() == DiffOp<Line>::change) {
         printLinesAdd(to, nTo, iTo);
      } else {
         break;   // while
      }
   }   // while  <nTo
}   // printLinesChange()
void WikidiffLX::printLineChangeRow(const LineVector from,
                                    const LineVector to,
                                    const size_t nFrom,
                                    const size_t nTo,
                                    size_t * iFrom,
                                    size_t * iTo)
{
   // Print table cells for 'change'
   // Save worddiff results from left cell for right cell
   WordsVector wordsTo;
   WordDiffVector worddiffs;
   result += TR_LEFT;
   size_t moreL = printLineChange(from, nFrom, true, iFrom, &wordsTo, &worddiffs);
   result += TD_RIGHT;
   size_t moreR = printLineChange(to, nTo, false, iTo, &wordsTo, &worddiffs);
   result += "</td>\n</tr>\n";
   if (moreL + moreR) {
      const Line *eL = &from[(*iFrom)-1];
      const Line *eR = &to[(*iTo)-1];
      size_t m;
      size_t min = std::min(moreL, moreR);
      for (m = 0; m < min; m++) {
         result += TR_LEFT;
         if (eL->equals_trailing(eR, m)) {
            result += "&nbsp;" TD_RIGHT "&nbsp;";
         } else {
            printSpacediff(eL->get_trailingLength(m));
            result += TD_RIGHT;
            printSpacediff(eR->get_trailingLength(m));
         }
         result += "</td>\n</tr>\n";
      }   // for m
      for (m = min; m < moreL; m++) {
         result += TR_LEFT "&nbsp;</td>\n</tr>\n";
      }   // for m
      for (m = min; m < moreR; m++) {
         result += TD_LEFT TD_RIGHT "&nbsp;</td>\n</tr>\n";
      }   // for m
   }
}   // printLineChangeRow()
int WikidiffLX::printLineChange(const LineVector change,
                                const size_t n,
                                const bool left,
                                size_t * i,
                                WordsVector * wordsTo,
                                WordDiffVector * worddiffs)
{
   // Print one table cell for 'change'
   // Return number of trailing lines
   const Line *e;
   const Line *f;
   size_t more = 0;
   size_t kDiff = 0;
   result += "<div>";
   while (*i < n) {
      e = &change[*i];
      (*i)++;
      f = e->get_counterPart();
      if (f) {
         if (e->equals_body()) {
            printText(e->get_body());
            if (! e->equals_suffix(f)) {
               printSpacediff(e->get_suffixLength());
            }
         } else if (left) {
            WordVector words1;
            wordsTo->push_back(WordVector());
            explodeWords(e->get_bodyBegin(), e->get_bodyEnd(), words1);
            explodeWords(f->get_bodyBegin(), f->get_bodyEnd(),
                         (*wordsTo)[kDiff]);
            worddiffs->push_back(WordDiff(words1, (*wordsTo)[kDiff]));
            printWordDiff((*worddiffs)[kDiff], false);
            kDiff++;
         } else {
            printWordDiff((*worddiffs)[kDiff], true);
            kDiff++;
         }
      } else {   // virtual line without counterPart
         result += SPAN_INLINE;
         printText(e->get_body());
         result += "</span>";
      }
      if (e->is_HardBreak()) {
         more = e->get_trailingCount();
         break;   // while
      } else {
         result += " ";
      }
   }   // while  true
   result += "</div>";
   return more;
}   // printLineChange()
void WikidiffLX::printSpacediff(const size_t n)
{
   // Print a SPAN with n whitespace replacement characters
   result += SPAN_INLINE;
   for (size_t i = 0;  i < n;  i++) {
      result += SPACE_DIFF;
   }   // for i
   result += "</span>";
}   // printSpacediff()

printText()

[edit]

Based on wikidiff2.

void WikidiffLX::printText(const String & input)
{
   size_t start = 0;
   size_t end = input.find_first_of("<>&");
   while (end != String::npos) {
      if (end > start) {
         result.append(input, start, end - start);
      }
      switch (input[end]) {
         case '<':
            result.append("&lt;");
            break;
         case '>':
            result.append("&gt;");
            break;
         default /*case '&'*/:
            result.append("&amp;");
      }
      start = end + 1;
      end = input.find_first_of("<>&", start);
   }
   // Append the rest of the string after the last special character
   if (start < input.size()) {
      result.append(input, start, input.size() - start);
   }
}   // printText()
void WikidiffLX::printTextRed(const String & text)
{
   // HTML source code in UTF-8, general colour red, zero-width chars
   const Iterator pE = text.end();
   Iterator p = text.begin();
   size_t i = 0;
   size_t j = 0;
   size_t k = 0;
   wchar_t ch;
   unsigned char b;
   while (p < pE) {
      b = (unsigned char)*p;
      j++;
      if (b < 0xC0) {   // ASCII or undefined
         if (b == 0x26 || b == 0x3D) {   // & <
            k = j;
            j--;
         }
      } else {   // UTF range
         ch = nextUtf8Char(p, b, pE);
         if (ch == 0xAD ||
             ch >= 0x200B && ch <= 0x200F ||
             ch >= 0x202A && ch <= 0x202E) {
            k = j + (ch < 0x0800 ? 1 : 2);
            j--;
         }
      }
      if (k) {
         if (j > i) {
            result.append(text, i, j - i);
         }
         if (b == 0x26) {
            result.append("&amp;");
         } else if (b == 0x3D) {
            result.append("&lt;");
         } else {   // UTF
            result.append(SPACE_DIFF);
         }
         i = k;
         j = k;
         k = 0;
      }
   }   // while (p < pE)
   if (j > i) {
      result.append(text, i, j - i);
   }
}   // printTextRed()

printWordDiff()

[edit]

Based on wikidiff2.

  • Added printWordDiffSideBlack()
  • Modified printWordDiffSide() to reflect whether both adjacent words are black and shall present whitespace differences.
void WikidiffLX::printWordDiff(WordDiff worddiff,
                               const bool added)
{
   const DiffOp<Word> * op;
   DiffOp<Word>::PointerVector current;
   DiffOp<Word>::PointerVector others;
   String word;
   const size_t nw = worddiff.size();
   const size_t nwb = nw - 1;
   const size_t nwb2 = (nwb ? nw - 2 : 0);
   size_t n, j;
   bool lastBlack;
   for (size_t i = 0; i < nw; ++i) {
      op = &worddiff[i];
      if (op->op == DiffOp<Word>::copy) {
         lastBlack = (i < nwb);
         if (added) {
            current = op->to;
            others = op->from;
         } else {
            current = op->from;
            others = op->to;
         }
         if (lastBlack) {
            switch (worddiff[i+1].op) {
               case DiffOp<Word>::copy :
                  lastBlack = false;
                  break;
               case DiffOp<Word>::del :
                  if (added) {
                     if (i < nwb2) {
                        lastBlack = (worddiff[i+2].op == DiffOp<Word>::copy);
                     } else {
                        lastBlack = false;
                     }
                  }
                  break;
               case DiffOp<Word>::add :
                  if (! added) {
                     if (i < nwb2) {
                        lastBlack = (worddiff[i+2].op == DiffOp<Word>::copy);
                     } else {
                        lastBlack = false;
                     }
                  }
                  break;
               case DiffOp<Word>::change :   // skip
                  break;
            }   // switch
         }
         printWordDiffBlack(current, others, lastBlack, word);
      } else if (!added && (op->op == DiffOp<Word>::del ||
                            op->op == DiffOp<Word>::change)) {
         n = op->from.size();
         result += SPAN_INLINE;
         for (j = 0; j < n; j++) {
            op->from[j]->get_whole(word);
            printText(word);
         }
         result += "</span>";
      } else if (added && (op->op == DiffOp<Word>::add || op->op == DiffOp<Word>::change)) {
         n = op->to.size();
         result += SPAN_INLINE;
         for (j = 0; j < n; j++) {
            op->to[j]->get_whole(word);
            printText(word);
         }
         result += "</span>";
      }
   }
}   // printWordDiff()
void WikidiffLX::printWordDiffBlack(const DiffOp<Word>::PointerVector current,
                                        const DiffOp<Word>::PointerVector others,
                                        const bool lastBlack,
                                        String & word) {
   // Print a block of words in black, but possibly with space diffs.
   // The last black word before a red one is indicated by lastBlack.
   size_t k;
   const size_t n = current.size();
   const size_t meet = (lastBlack ? n-1 : n+1);
   const Word *item;
   for (size_t j = 0;  j < n;  j++) {
      item = current[j];
      if (j == meet) {
         item->get_whole(word);
         printText(word);
      } else {
         if (item->equals_suffix(others[j])) {
            item->get_whole(word);
            printText(word);
         } else {   // space diff
            k = item->get_suffixlength();
            // may be limited to a certain number of spaces
            if (k == 0) {   // last word in line
               item->get_whole(word);
               printText(word);
            } else {
               item->get_body(word);
               printText(word);
               result += SPAN_INLINE;
               for (size_t i = 0;  i < k;  i++) {
                  result += SPACE_DIFF;
               }   // for i
               result += "</span>";
            }
         }
      }
   }   // for j
}   // printWordDiffBlack()

recoverParagraphs()

[edit]

New methodology for postprocessing of virtual lines.

  • Precondition: virtual lines might change op code within paragraph
  • Postcondition: every paragraph has a unique op code
void WikidiffLX::recoverParagraphs(LineVector from,
                                   LineVector to) {
   bool learning;
   bool left = true;
   diffAdjustSingle(from, DiffOp<Line>::del);
   diffAdjustSingle(to, DiffOp<Line>::add);
   diffIncompleteChanges(from);
   learning = diffIncompleteChanges(to);
   while (learning) {
      learning = diffIncompleteChanges((left ? from : to));
      left = ! left;
   }   // while learning
}   // recoverParagraphs()
void WikidiffLX::diffAdjustSingle(LineVector vector,
                                  const int opMerge)
{
   // Ensure that the entire paragraph containing opMerge is either
   //  * single paragraph opMerge only
   //  * 'change' (if mixed virtual lines)
   Line *e;
   size_t k;
   size_t m = 0;
   const size_t n = vector.size();
   bool leap = false;
   for (size_t i = 0; i < n; i++) {
      e = &vector[i];
      if (e->is_HardBreak()) {
         m = i + 1;
      }
      if (e->get_diffCode() == opMerge) {
         leap = false;
         for (k = m; k < n; k++) {
            e = &vector[k];
            if (e->get_diffCode() != opMerge) {
               leap = true;
            }
            if (e->is_HardBreak()) {
               break;
            }
         }   // for k
         if (k != m) {   // virtual lines
            if (leap) {   // / mixed op
               i = k;
               for (k = m; k < i; k++) {
                  e = &vector[k];
                  e->set_diffCopyChange();
               }   // for k
            }
         }
      }
   }   // for i
}   // diffAdjustSingle()
bool WikidiffLX::diffIncompleteChanges(LineVector vector)
{
   // Unify paragraphs with partial change code
   const size_t n = vector.size();
   Line *e;
   Line *f;
   size_t k;
   size_t m = 0;
   bool learning = false;
   for (size_t i = 0; i < n; i++) {
      e = &vector[i];
      if (e->is_HardBreak()) {
         m = i + 1;
      }
      if (e->is_Copy()) {
         for (k = m; k < n; k++) {
            e = &vector[k];
            f = e->get_counterPart();
            if (f->is_Change()) {
               e->set_diffCopyChange();
               learning = true;
            }
            if (e->is_HardBreak()) {
               i = k;
               break;
            }
         }   // for k
      }
   }   // for i
   return learning;
}   // diffIncompleteChanges()