User:PerfektesChaos/WikidiffLX/coding/WikidiffLX.cpp
Based on wikidiff2/wikidiff2.cpp (rev 83886 Mar 2011).
Declarations in WikidiffLX.h
- Renamed following the new generation name
- Added many methods
head
[edit]/**
* Diff formatter, based on code by Steinar H. Gunderson,
* converted to work with the Dairiki diff engine by Tim Starling (Wikidiff2)
*
* Extended for WikidiffLX by PerfektesChaos@de.wikipedia 2011
* GPL.
*/
#include "WikidiffLX.h"
#include <string>
#include <thai/thailib.h>
#include <thai/thwchar.h>
#include <thai/thbrk.h>
#include <stdio.h>
#define SPAN_INLINE "<span class=\"diffchange diffchange-inline\">"
#define TD_LEFT "<tr>\n <td colspan=\"2\"> "
#define TD_RIGHT "</td>\n" \
" <td class=\"diff-marker\">+</td>\n" \
" <td class=\"diff-addedline\">"
#define TR_LEFT "<tr>\n <td class=\"diff-marker\">−</td>\n" \
" <td class=\"diff-deletedline\">"
public
[edit]execute()
[edit]Based on wikidiff2 with following changes:
- merged with former
diffLines()
- delegating HTML output to
printLines()
Major task controlling line separation, diff engine, postprocessing engine result, formatting HTML.
const WikidiffLX::String & WikidiffLX::execute(const String & text1,
const String & text2,
int numContextLines)
{
numberContextL = numContextLines;
// Allocate some result space to avoid excessive copying
result.clear();
result.reserve(text1.size() + text2.size() + MAX_DIFF_LINE);
// Split input strings into lines
LineVector lines1;
LineVector lines2;
explodeLines(text1, lines1);
explodeLines(text2, lines2);
// Do the diff
LineDiff linediff(lines1, lines2);
// Transfer engine result into lines
diffResult(linediff);
// Turn whitespace only differences into changes
diffWhitespace(lines1);
// Adjust virtual line segmentation
recoverParagraphs(lines1, lines2);
// Format line differences as HTML
printLines(lines1, lines2);
// Return a reference to the result buffer
return result;
} // execute()
private
[edit]diffResult()
[edit]New methodology for postprocessing of virtual lines. Result of engine is transferred from LineDiff into Line objects. Note that the two LineVector collections are accessed by their pointers in DiffOp
.
The block structure of combined op codes in LineDiff is part of the engine algorithm, located deep inside. That should not be changed. It would affect Word presentation as well as possible exchange of the engine. The Diff object does not permit array manipulation. The least confusing way is to transfer the engine result into the Line objects.
void WikidiffLX::diffResult(LineDiff diffs)
{
// Transfer engine result into Line objects
const size_t nDiff = diffs.size();
size_t nBoth;
size_t nFrom;
size_t nTo;
size_t j;
DiffOp<Line> *diff;
LinePVector fv;
LinePVector tv;
Line *f;
Line *t;
for (size_t i = 0; i < nDiff; i++) {
diff = &diffs[i];
switch (diff->op) {
case DiffOp<Line>::copy :
case DiffOp<Line>::change :
fv = diff->from;
tv = diff->to;
nFrom = fv.size();
nTo = tv.size();
nBoth = std::min(nFrom, nTo);
for (j = 0; j < nBoth; j++) {
f = (Line *)fv[j];
t = (Line *)tv[j];
f->set_diff(diff->op, t);
t->set_diff(diff->op, f);
} // for j
for (j = nBoth; j < nFrom; j++) {
f = (Line *)fv[j];
f->set_diff(DiffOp<Line>::del);
} // for j
for (j = nBoth; j < nTo; j++) {
t = (Line *)tv[j];
t->set_diff(DiffOp<Line>::add);
} // for j
break;
case DiffOp<Line>::del :
fv = diff->from;
nFrom = fv.size();
for (j = 0; j < nFrom; j++) {
f = (Line *)fv[j];
f->set_diff(DiffOp<Line>::del);
} // for j
break;
case DiffOp<Line>::add :
tv = diff->to;
nTo = tv.size();
for (j = 0; j < nTo; j++) {
t = (Line *)tv[j];
t->set_diff(DiffOp<Line>::add);
} // for j
break;
} // switch .op
} // for i
} // diffResult()
diffWhitespace()
[edit]New methodology for trailing whitespace at end of line and invisible trailing lines. Major part of work is done within Line objects.
Note that this is called once for from
only but does affect the to
counterparts also.
void WikidiffLX::diffWhitespace(LineVector from)
{
// detect whitespace only differences and translate into changes
const size_t n = from.size();
for (size_t i = 0; i < n; i++) {
from[i].whitespaceOnly();
} // for i
} // diffWhitespace()
explodeLines()
[edit]Based on explodeLines()
in wikidiff2 but recognizing breaks of virtual lines. Detect invisible trailing lines, separate from lines with visible content and hide them in Line objects.
Usage of allocated String for line text replaced by iterators.
void WikidiffLX::explodeLines(const String & text,
LineVector & lines)
{
const Iterator pE = text.end();
Iterator pB = text.begin();
Iterator p;
Iterator pH;
Iterator pV;
Iterator pS;
Iterator pT;
unsigned char b;
#ifndef NO_LINE_NUMBERS
size_t k = 1;
size_t keep = 1;
#endif
while (pB != pE) {
pH = std::find(pB, pE, '\n');
while (pH - pB > LINELENGTH_VIRTUAL) {
pT = pH - LINELENGTH_MIN;
findVirtualBreak(pB + LINELENGTH_MIN, pT, &pV);
if (pV == pT) { // not found
break; // while
} else {
pS = pV;
while (pS != pE) {
b = (unsigned char)*pS;
if (b == 0x20) { // or other invisibles
pS++;
} else {
break; // while
}
} // while
if (pH - pS > LINELENGTH_MIN) {
#ifdef NO_LINE_NUMBERS
lines.push_back(Line(pB, pV, pS));
#else
lines.push_back(Line(pB, pV, pS, keep));
#endif
pB = pS;
} else {
break; // while
}
}
} // while pH - pB > LINELENGTH_VIRTUAL
pS = pH;
p = pH;
while (p > pB) {
p--;
b = (unsigned char)*p;
if (b == 0x20) { // or other invisibles
pS = p;
} else {
break; // while
}
} // while suffix
if (pH == pE) {
pT = pE;
} else {
pT = pH;
p = pT;
while (true) {
if (p == pE) {
if (pH == pB) {
pH = pE;
}
pT = pE;
break; // while
} else {
b = (unsigned char)*p;
if (b > 0x20) { // may be other invisibles (Unicode)
break; // while
} else if (b == 0x0A) {
#ifndef NO_LINE_NUMBERS
if (pT != pH) {
k++;
}
#endif
pT = p + 1;
}
}
p++;
} // while trailing
}
#ifdef NO_LINE_NUMBERS
lines.push_back(Line(pB, pS, pH, pT));
#else
lines.push_back(Line(pB, pS, pH, pT, keep));
k++;
keep = k;
#endif
pB = pT;
if (pT != pE) {
pT++;
}
} // while pB != pE
} // explodeLines()
explodeWords()
[edit]Based on explodeWords()
in wikidiff2 but completely rewritten and accelerated. Handling of Unicode spaces introduced. Thai processing started only if really Thai character. Every CJK symbol is treated as single word now.
Not yet included:
- rev:107135 possible future direction: ICU.
void WikidiffLX::explodeWords(const String & text,
WordVector &words)
{
// Split a string into words
const Iterator pE = text.end();
Iterator p = text.begin();
// Don't try to do a word-level diff on very long lines
if (text.size() > MAX_DIFF_LINE) {
words.push_back(Word(p, pE, pE));
return;
}
Iterator chStart;
Iterator suffixStart;
Iterator wordStart;
unsigned int ch;
unsigned char b;
bool livingSuffix = false;
bool livingWord = false;
bool locateSuffix;
while (p < pE) {
chStart = p;
locateSuffix = false;
b = (unsigned char)*p;
if (b < 0xC0) { // ASCII or undefined
if (b < 0x21) {
locateSuffix = (b == 0x20 || 0x09); // some more ...
}
} else { // UTF range
ch = nextUtf8Char(p, b, pE);
if (ch > 0x0E00) { // could be interesting
if (ch > 0x0E00 && ch <= 0x0E5F) { // Thai
explodeWordSuffix(livingWord, wordStart, livingSuffix, suffixStart, chStart, words);
wordStart = chStart;
explodeWordsThai(ch, p, pE, words, wordStart);
} else if (ch >= 0x2002 && ch <= 0x200A) { // spaces
locateSuffix = true;
} else if (ch >= 0x1100 && ch <= 0x11FF ||
ch >= 0x2E80 && ch <= 0x9FFF ||
ch >= 0xAC00 && ch <= 0xD7AF ||
ch >= 0x20000 && ch <= 0x2FA1F) { // CJK
explodeWordSuffix(livingWord, wordStart, livingSuffix, suffixStart, chStart, words);
wordStart = chStart;
words.push_back(Word(chStart, p, p));
}
}
}
if (locateSuffix) {
if (! livingSuffix) {
suffixStart = chStart;
livingSuffix = true;
}
} else if (livingSuffix) {
explodeWordSuffix(livingWord, wordStart, livingSuffix, suffixStart, p, words);
} else if (! livingWord) {
wordStart = chStart;
livingWord = true;
}
++p;
} // while (p < pE)
explodeWordSuffix(livingWord, wordStart, livingSuffix, suffixStart, pE, words);
} // explodeWords()
void WikidiffLX::explodeWordSuffix(bool & livingWord,
const Iterator wordStart,
bool & livingSuffix,
const Iterator suffixStart,
const Iterator wordNext,
WordVector & words)
{
// Add word body and suffix interval to words, if any; reset living
if (livingWord) { // regular word pending
if (wordStart < wordNext) {
words.push_back(Word(wordStart, (livingSuffix?suffixStart:wordNext), wordNext));
}
} else if (livingSuffix) { // append to previous CJK or Thai word
if (! words.empty()) { // not the beginning of the line
words.pop_back();
words.push_back(Word(wordStart, suffixStart, wordNext));
}
}
livingWord = false;
livingSuffix = false;
} // explodeWordSuffix()
explodeWordsThai()
[edit]Re-written procedure for handling of substring containing thai characters only in Thai line splitting into words within explodeWords().
void WikidiffLX::explodeWordsThai(unsigned int ch,
Iterator & p,
const Iterator pE,
WordVector & words,
Iterator & wordStart)
{
// Pointing on thai character sequence and add particular thai words.
// Thai chars are in U+0E00...0E5F each represented by three bytes UTF8.
IntVector thaiBreakPositions;
String tisText;
Iterator pT = p;
Iterator p0 = wordStart;
int nBreaks;
size_t max = (pE - p) / 3 + 1;
unsigned char b;
tisText.reserve(max);
tisText = th_uni2tis(ch);
while (pT < pE) {
b = (unsigned char)*pT;
if (b == 0xC0) {
ch = nextUtf8Char(pT, b, pE);
if (ch > 0x0E00 && ch <= 0x0E5F) {
tisText += th_uni2tis(ch);
p = pT;
pT++;
} else { // not in Thai range
break; // while
}
} else { // out of relevant UTF8
break; // while
}
} // while (pT < pE)
tisText += '\0';
max = tisText.size();
thaiBreakPositions.resize(max);
nBreaks = th_brk((const thchar_t*)(tisText.data()), &thaiBreakPositions[0], max);
for (int i = 0; i < nBreaks; i++) {
pT = p0 + 3 * thaiBreakPositions[i];
words.push_back(Word(wordStart, pT, pT));
wordStart = pT;
} // for i
} // explodeWordsThai()
findVirtualBreak()
[edit]New method for detection of breaks of virtual lines within explodeLines().
void WikidiffLX::findPunctSpace(const Iterator pB,
const Iterator pE,
const unsigned char punct,
Iterator * p)
{
// Detect punct+space; return point after punct or pE
Iterator pR = pE;
unsigned char b;
*p = std::find(pB, pE, punct);
while (*p != pE) {
*p++;
if (*p != pE) {
b = (unsigned char)**p;
if (b == 0x20) {
break; // while
} else {
*p = std::find(*p, pE, punct);
}
}
}
} // findPunctSpace()
void WikidiffLX::findVirtualBreak(const Iterator pB,
const Iterator pE,
Iterator * p)
{
// Detect virtual line break; return point after terminator
*p = pE;
findPunctSpace(pB, *p, '.', p);
findPunctSpace(pB, *p, '?', p);
findPunctSpace(pB, *p, '!', p);
/*
Other candidates, requiring UTF8 decoding:
* U+3002 CJK period
*/
} // findVirtualBreak()
nextUtf8Char()
[edit]Based on wikidiff2 but demanding 32bit characters explicitly.
unsigned int WikidiffLX::nextUtf8Char(Iterator & p,
unsigned char b,
const Iterator pE)
{
// Weak UTF-8 decoder
// Will return garbage on invalid input (overshort sequences, overlong sequences, etc.)
// but Mediawiki never provides bad UTF-8 texts ...
short seqLength = 0;
unsigned int c = 0;
while (p < pE) {
if (b < 0x80) {
c = b;
seqLength = 0;
} else if (b >= 0xC0) {
// Start of UTF-8 character
// If this is unexpected, due to an overshort sequence,
// we ignore the invalid sequence and resynchronise here
if (b < 0xE0) {
seqLength = 1;
c = b & 0x1F;
} else if (b < 0xF0) {
seqLength = 2;
c = b & 0x0F;
} else {
seqLength = 3;
c = b & 7;
}
} else if (seqLength) {
c <<= 6;
c |= b & 0x3F;
--seqLength;
} else {
// Unexpected continuation, ignore
}
++p;
if (! seqLength) {
break; // while
}
} // while (p < pE)
return c;
} // nextUtf8Char()
printLines()
[edit]Mainly new code for the presentation of paragraphs. The respective diffLines()
procedure in wikidiff2 is replaced by printLines()
. Handling of virtual line sequences is implemented by filling table cells until hard break encountered. Trailing invisible lines are discovered and presented as table cells where necessary.
void WikidiffLX::printLines(const LineVector from,
const LineVector to)
{
// Print all diff into result as HTML
// linediff is adjusted with hard breaks at .op borders
// .op=change lines are merged as single paragraphs
const size_t nFrom = from.size();
const size_t nTo = to.size();
size_t iFrom = 0;
size_t iTo = 0;
bool leap = from[0].is_Copy() && to[0].is_Copy();
while (iFrom < nFrom && iTo < nTo) {
if (leap) {
printLinesContext(from, to, nFrom, nTo, &iFrom, &iTo);
} else {
printLinesDiff(from, to, nFrom, nTo, &iFrom, &iTo);
}
leap = ! leap;
} // while
} // printLines()
void WikidiffLX::printLinesContext(const LineVector from,
const LineVector to,
const size_t nFrom,
const size_t nTo,
size_t * iFrom,
size_t * iTo)
{
// Print unchanged lines as context before / after diff
// *iFrom,*iTo changing from start of copy block to end of copy block
const size_t jFrom = *iFrom; // end of previous differing
const size_t jTo = *iTo; // end with following differing
size_t k;
bool latest = true; // last block at all
for ( ; *iFrom < nFrom; *iFrom++) {
if (! from[*iFrom].is_Copy()) {
latest = false;
break; // for *iFrom
}
} // for *iFrom
for ( ; *iTo < nTo; *iTo++) {
if (! to[*iTo].is_Copy()) {
latest = false;
break; // for *iTo
}
} // for *iTo
if (jFrom || jTo) { // pending trailer required
k = jFrom + numberContextL;
if (latest) {
printLinesContextRows(from, *iFrom, std::min(k, nFrom));
} else { // differing block follows
if (*iFrom - jFrom > 2 * numberContextL + 1) { // separate
printLinesContextRows(from, jFrom, jFrom + numberContextL);
k = *iFrom - numberContextL;
#ifndef NO_LINE_NUMBERS
printLineNumber(from[k], to[*iTo - numberContextL]);
#endif
printLinesContextRows(from, k, *iFrom);
} else { // one block
printLinesContextRows(from, jFrom, *iFrom);
}
}
} else if (! latest) { // first lines not differing, but later
k = (*iFrom > numberContextL ? *iFrom - numberContextL : 0);
#ifndef NO_LINE_NUMBERS
printLineNumber(from[k], to[k]);
#endif
printLinesContextRows(from, k, *iFrom);
}
} // printLinesContext()
void WikidiffLX::printLinesContextRows(const LineVector context,
const size_t iBeg,
const size_t iEnd)
{
size_t i;
size_t k = iBeg;
for (i = iBeg; i < iEnd; i++) {
if (context[i].is_HardBreak()) {
printLinesContextRow(context, k, i);
k = i;
}
} // for i
if (k < iEnd) {
printLinesContextRow(context, k, iEnd);
}
} // printLinesContextRows()
void WikidiffLX::printLinesContextRow(const LineVector context,
const size_t iBeg,
const size_t iEnd)
{
// Print paragraph for context <tr ...</tr>
size_t i;
result += "<tr>\n"
" <td class=\"diff-marker\"> </td>\n"
" <td class=\"diff-context\"><div>";
for (i = iBeg; i < iEnd; i++) {
if (i != iBeg) {
result += " "; // drop invisible whitespace, pay for a space
}
printText(context[i].get_body());
} // for i
result += "</div></td>\n"
" <td class=\"diff-marker\"> </td>\n"
" <td class=\"diff-context\"><div>";
for (i = iBeg; i < iEnd; i++) {
if (i != iBeg) {
result += " "; // drop invisible whitespace, pay for a space
}
printText(context[i].get_body());
} // for i
result += "</div></td>\n</tr>\n";
} // printLinesContextRow()
#ifndef NO_LINE_NUMBERS
void WikidiffLX::printLineNumber(const Line f, const Line t)
{
snprintf(scratch256, 256,
"<tr>\n"
" <td colspan=\"2\" class=\"diff-lineno\"><!--LINE %u--></td>\n"
" <td colspan=\"2\" class=\"diff-lineno\"><!--LINE %u--></td>\n"
"</tr>\n",
f.get_lineNumber(), t.get_lineNumber());
result += scratch256;
} // printLineNumber()
#endif
void WikidiffLX::printLinesDiff(const LineVector from,
const LineVector to,
const size_t nFrom,
const size_t nTo,
size_t * iFrom,
size_t * iTo)
{
// Print sequence of differing lines
int mL;
int mR;
while (*iFrom < nFrom || *iTo < nTo) {
mL = DiffOp<Line>::copy;
mR = DiffOp<Line>::copy;
if (*iFrom < nFrom) {
mL = from[*iFrom].get_diffCode();
if (mL == DiffOp<Line>::del) {
mL = printLinesDelete(from, nFrom, iFrom);
}
}
if (*iTo < nTo) {
mR = to[*iTo].get_diffCode();
if (mR == DiffOp<Line>::del) {
mR = printLinesAdd(to, nTo, iTo);
}
}
if (mL == DiffOp<Line>::copy && mR == DiffOp<Line>::copy) {
break; // while
} else {
printLinesChange(from, to, nFrom, nTo, iFrom, iTo);
}
} // while <nFrom && <nTo
} // printLinesDiff()
int WikidiffLX::printLineSingleCell(const LineVector vector,
const size_t n,
size_t * i)
{
// Print table cell for 'del' or 'add'
// Return number of trailing lines
const Line *e;
size_t more = 0;
while (*i < n) {
e = &vector[*i];
(*i)++;
printText(e->get_body());
if (e->is_HardBreak()) {
more = e->get_trailingCount();
break; // while
} else {
result += " ";
}
} // while true
result += "</div></td>\n</tr>\n";
return more;
} // printLineSingleCell()
int WikidiffLX::printLinesDelete(const LineVector from,
const size_t nFrom,
size_t * iFrom)
{
// Print blocks of deleted lines starting at from[*iFrom]
// Return next opcode
size_t more;
int next = DiffOp<Line>::copy;
while (*iFrom < nFrom) {
next = from[*iFrom].get_diffCode();
if (next == DiffOp<Line>::copy) {
break; // while
} else {
result += TR_LEFT "<div>";
more = printLineSingleCell(from, nFrom, iFrom);
while (more) {
result += TR_LEFT " </td>\n</tr>\n";
more--;
} // while
}
} // while <nFrom
return next;
} // printLinesDelete()
int WikidiffLX::printLinesAdd(const LineVector to,
const size_t nTo,
size_t * iTo)
{
// Print blocks of added lines starting at to[*iTo]
// Return next opcode
size_t more;
int next = DiffOp<Line>::copy;
while (*iTo < nTo) {
next = to[*iTo].get_diffCode();
if (next == DiffOp<Line>::copy) {
break; // while
} else {
result += TD_LEFT TD_RIGHT "<div>";
more = printLineSingleCell(to, nTo, iTo);
while (more) {
result += TD_LEFT TD_RIGHT " </td>\n</tr>\n";
more--;
} // while
}
} // while <nTo
return next;
} // printLinesAdd()
void WikidiffLX::printLinesChange(const LineVector from,
const LineVector to,
const size_t nFrom,
const size_t nTo,
size_t * iFrom,
size_t * iTo)
{
// Print pairs of blocks of 'change' starting at from[*iFrom]/to[*iTo]
int mL;
int mR;
while (*iFrom < nFrom && *iTo < nTo) {
mL = from[*iFrom].get_diffCode();
mR = to[*iTo].get_diffCode();
if (mL == DiffOp<Line>::change && mR == DiffOp<Line>::change) {
printLineChangeRow(from, to, nFrom, nTo, iFrom, iTo);
} else if (mL == DiffOp<Line>::change) {
printLinesDelete(from, nFrom, iFrom);
} else if (mR == DiffOp<Line>::change) {
printLinesAdd(to, nTo, iTo);
} else {
break; // while
}
} // while <nFrom && <nTo
while (*iFrom < nFrom) {
if (from[*iFrom].get_diffCode() == DiffOp<Line>::change) {
printLinesDelete(from, nFrom, iFrom);
} else {
break; // while
}
} // while <nFrom
while (*iTo < nTo) {
if (to[*iTo].get_diffCode() == DiffOp<Line>::change) {
printLinesAdd(to, nTo, iTo);
} else {
break; // while
}
} // while <nTo
} // printLinesChange()
void WikidiffLX::printLineChangeRow(const LineVector from,
const LineVector to,
const size_t nFrom,
const size_t nTo,
size_t * iFrom,
size_t * iTo)
{
// Print table cells for 'change'
// Save worddiff results from left cell for right cell
WordsVector wordsTo;
WordDiffVector worddiffs;
result += TR_LEFT;
size_t moreL = printLineChange(from, nFrom, true, iFrom, &wordsTo, &worddiffs);
result += TD_RIGHT;
size_t moreR = printLineChange(to, nTo, false, iTo, &wordsTo, &worddiffs);
result += "</td>\n</tr>\n";
if (moreL + moreR) {
const Line *eL = &from[(*iFrom)-1];
const Line *eR = &to[(*iTo)-1];
size_t m;
size_t min = std::min(moreL, moreR);
for (m = 0; m < min; m++) {
result += TR_LEFT;
if (eL->equals_trailing(eR, m)) {
result += " " TD_RIGHT " ";
} else {
printSpacediff(eL->get_trailingLength(m));
result += TD_RIGHT;
printSpacediff(eR->get_trailingLength(m));
}
result += "</td>\n</tr>\n";
} // for m
for (m = min; m < moreL; m++) {
result += TR_LEFT " </td>\n</tr>\n";
} // for m
for (m = min; m < moreR; m++) {
result += TD_LEFT TD_RIGHT " </td>\n</tr>\n";
} // for m
}
} // printLineChangeRow()
int WikidiffLX::printLineChange(const LineVector change,
const size_t n,
const bool left,
size_t * i,
WordsVector * wordsTo,
WordDiffVector * worddiffs)
{
// Print one table cell for 'change'
// Return number of trailing lines
const Line *e;
const Line *f;
size_t more = 0;
size_t kDiff = 0;
result += "<div>";
while (*i < n) {
e = &change[*i];
(*i)++;
f = e->get_counterPart();
if (f) {
if (e->equals_body()) {
printText(e->get_body());
if (! e->equals_suffix(f)) {
printSpacediff(e->get_suffixLength());
}
} else if (left) {
WordVector words1;
wordsTo->push_back(WordVector());
explodeWords(e->get_bodyBegin(), e->get_bodyEnd(), words1);
explodeWords(f->get_bodyBegin(), f->get_bodyEnd(),
(*wordsTo)[kDiff]);
worddiffs->push_back(WordDiff(words1, (*wordsTo)[kDiff]));
printWordDiff((*worddiffs)[kDiff], false);
kDiff++;
} else {
printWordDiff((*worddiffs)[kDiff], true);
kDiff++;
}
} else { // virtual line without counterPart
result += SPAN_INLINE;
printText(e->get_body());
result += "</span>";
}
if (e->is_HardBreak()) {
more = e->get_trailingCount();
break; // while
} else {
result += " ";
}
} // while true
result += "</div>";
return more;
} // printLineChange()
void WikidiffLX::printSpacediff(const size_t n)
{
// Print a SPAN with n whitespace replacement characters
result += SPAN_INLINE;
for (size_t i = 0; i < n; i++) {
result += SPACE_DIFF;
} // for i
result += "</span>";
} // printSpacediff()
printText()
[edit]Based on wikidiff2.
- Use
printText()
unchanged. - Added
printTextRed()
basically likeprintText()
but visualize zero-width differences within a red word.
void WikidiffLX::printText(const String & input)
{
size_t start = 0;
size_t end = input.find_first_of("<>&");
while (end != String::npos) {
if (end > start) {
result.append(input, start, end - start);
}
switch (input[end]) {
case '<':
result.append("<");
break;
case '>':
result.append(">");
break;
default /*case '&'*/:
result.append("&");
}
start = end + 1;
end = input.find_first_of("<>&", start);
}
// Append the rest of the string after the last special character
if (start < input.size()) {
result.append(input, start, input.size() - start);
}
} // printText()
void WikidiffLX::printTextRed(const String & text)
{
// HTML source code in UTF-8, general colour red, zero-width chars
const Iterator pE = text.end();
Iterator p = text.begin();
size_t i = 0;
size_t j = 0;
size_t k = 0;
wchar_t ch;
unsigned char b;
while (p < pE) {
b = (unsigned char)*p;
j++;
if (b < 0xC0) { // ASCII or undefined
if (b == 0x26 || b == 0x3D) { // & <
k = j;
j--;
}
} else { // UTF range
ch = nextUtf8Char(p, b, pE);
if (ch == 0xAD ||
ch >= 0x200B && ch <= 0x200F ||
ch >= 0x202A && ch <= 0x202E) {
k = j + (ch < 0x0800 ? 1 : 2);
j--;
}
}
if (k) {
if (j > i) {
result.append(text, i, j - i);
}
if (b == 0x26) {
result.append("&");
} else if (b == 0x3D) {
result.append("<");
} else { // UTF
result.append(SPACE_DIFF);
}
i = k;
j = k;
k = 0;
}
} // while (p < pE)
if (j > i) {
result.append(text, i, j - i);
}
} // printTextRed()
printWordDiff()
[edit]Based on wikidiff2.
- Added
printWordDiffSideBlack()
- Modified
printWordDiffSide()
to reflect whether both adjacent words are black and shall present whitespace differences.
void WikidiffLX::printWordDiff(WordDiff worddiff,
const bool added)
{
const DiffOp<Word> * op;
DiffOp<Word>::PointerVector current;
DiffOp<Word>::PointerVector others;
String word;
const size_t nw = worddiff.size();
const size_t nwb = nw - 1;
const size_t nwb2 = (nwb ? nw - 2 : 0);
size_t n, j;
bool lastBlack;
for (size_t i = 0; i < nw; ++i) {
op = &worddiff[i];
if (op->op == DiffOp<Word>::copy) {
lastBlack = (i < nwb);
if (added) {
current = op->to;
others = op->from;
} else {
current = op->from;
others = op->to;
}
if (lastBlack) {
switch (worddiff[i+1].op) {
case DiffOp<Word>::copy :
lastBlack = false;
break;
case DiffOp<Word>::del :
if (added) {
if (i < nwb2) {
lastBlack = (worddiff[i+2].op == DiffOp<Word>::copy);
} else {
lastBlack = false;
}
}
break;
case DiffOp<Word>::add :
if (! added) {
if (i < nwb2) {
lastBlack = (worddiff[i+2].op == DiffOp<Word>::copy);
} else {
lastBlack = false;
}
}
break;
case DiffOp<Word>::change : // skip
break;
} // switch
}
printWordDiffBlack(current, others, lastBlack, word);
} else if (!added && (op->op == DiffOp<Word>::del ||
op->op == DiffOp<Word>::change)) {
n = op->from.size();
result += SPAN_INLINE;
for (j = 0; j < n; j++) {
op->from[j]->get_whole(word);
printText(word);
}
result += "</span>";
} else if (added && (op->op == DiffOp<Word>::add || op->op == DiffOp<Word>::change)) {
n = op->to.size();
result += SPAN_INLINE;
for (j = 0; j < n; j++) {
op->to[j]->get_whole(word);
printText(word);
}
result += "</span>";
}
}
} // printWordDiff()
void WikidiffLX::printWordDiffBlack(const DiffOp<Word>::PointerVector current,
const DiffOp<Word>::PointerVector others,
const bool lastBlack,
String & word) {
// Print a block of words in black, but possibly with space diffs.
// The last black word before a red one is indicated by lastBlack.
size_t k;
const size_t n = current.size();
const size_t meet = (lastBlack ? n-1 : n+1);
const Word *item;
for (size_t j = 0; j < n; j++) {
item = current[j];
if (j == meet) {
item->get_whole(word);
printText(word);
} else {
if (item->equals_suffix(others[j])) {
item->get_whole(word);
printText(word);
} else { // space diff
k = item->get_suffixlength();
// may be limited to a certain number of spaces
if (k == 0) { // last word in line
item->get_whole(word);
printText(word);
} else {
item->get_body(word);
printText(word);
result += SPAN_INLINE;
for (size_t i = 0; i < k; i++) {
result += SPACE_DIFF;
} // for i
result += "</span>";
}
}
}
} // for j
} // printWordDiffBlack()
recoverParagraphs()
[edit]New methodology for postprocessing of virtual lines.
- Precondition: virtual lines might change op code within paragraph
- Postcondition: every paragraph has a unique op code
void WikidiffLX::recoverParagraphs(LineVector from,
LineVector to) {
bool learning;
bool left = true;
diffAdjustSingle(from, DiffOp<Line>::del);
diffAdjustSingle(to, DiffOp<Line>::add);
diffIncompleteChanges(from);
learning = diffIncompleteChanges(to);
while (learning) {
learning = diffIncompleteChanges((left ? from : to));
left = ! left;
} // while learning
} // recoverParagraphs()
void WikidiffLX::diffAdjustSingle(LineVector vector,
const int opMerge)
{
// Ensure that the entire paragraph containing opMerge is either
// * single paragraph opMerge only
// * 'change' (if mixed virtual lines)
Line *e;
size_t k;
size_t m = 0;
const size_t n = vector.size();
bool leap = false;
for (size_t i = 0; i < n; i++) {
e = &vector[i];
if (e->is_HardBreak()) {
m = i + 1;
}
if (e->get_diffCode() == opMerge) {
leap = false;
for (k = m; k < n; k++) {
e = &vector[k];
if (e->get_diffCode() != opMerge) {
leap = true;
}
if (e->is_HardBreak()) {
break;
}
} // for k
if (k != m) { // virtual lines
if (leap) { // / mixed op
i = k;
for (k = m; k < i; k++) {
e = &vector[k];
e->set_diffCopyChange();
} // for k
}
}
}
} // for i
} // diffAdjustSingle()
bool WikidiffLX::diffIncompleteChanges(LineVector vector)
{
// Unify paragraphs with partial change code
const size_t n = vector.size();
Line *e;
Line *f;
size_t k;
size_t m = 0;
bool learning = false;
for (size_t i = 0; i < n; i++) {
e = &vector[i];
if (e->is_HardBreak()) {
m = i + 1;
}
if (e->is_Copy()) {
for (k = m; k < n; k++) {
e = &vector[k];
f = e->get_counterPart();
if (f->is_Change()) {
e->set_diffCopyChange();
learning = true;
}
if (e->is_HardBreak()) {
i = k;
break;
}
} // for k
}
} // for i
return learning;
} // diffIncompleteChanges()