THIS IS A TEST INSTANCE ONLY! REPOSITORIES CAN BE DELETED AT ANY TIME!

Git Source Code Mirror - This is a publish-only repository and all pull requests are ignored. Please follow Documentation/SubmittingPatches procedure for any of your improvements.
git
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1055 lines
27KB

  1. /*
  2. * LibXDiff by Davide Libenzi ( File Differential Library )
  3. * Copyright (C) 2003 Davide Libenzi
  4. *
  5. * This library is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU Lesser General Public
  7. * License as published by the Free Software Foundation; either
  8. * version 2.1 of the License, or (at your option) any later version.
  9. *
  10. * This library is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Lesser General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Lesser General Public
  16. * License along with this library; if not, see
  17. * <http://www.gnu.org/licenses/>.
  18. *
  19. * Davide Libenzi <davidel@xmailserver.org>
  20. *
  21. */
  22. #include "xinclude.h"
  23. #define XDL_MAX_COST_MIN 256
  24. #define XDL_HEUR_MIN_COST 256
  25. #define XDL_LINE_MAX (long)((1UL << (CHAR_BIT * sizeof(long) - 1)) - 1)
  26. #define XDL_SNAKE_CNT 20
  27. #define XDL_K_HEUR 4
/*
 * Result of xdl_split(): the coordinates where the current box was split,
 * plus flags telling the recursion whether each half still requires a
 * minimal (non-heuristic) diff.
 */
typedef struct s_xdpsplit {
	long i1, i2;		/* split point: record index in file1 / file2 */
	int min_lo, min_hi;	/* need_min for the lower / upper sub-box */
} xdpsplit_t;
  32. /*
  33. * See "An O(ND) Difference Algorithm and its Variations", by Eugene Myers.
  34. * Basically considers a "box" (off1, off2, lim1, lim2) and scan from both
  35. * the forward diagonal starting from (off1, off2) and the backward diagonal
  36. * starting from (lim1, lim2). If the K values on the same diagonal crosses
  37. * returns the furthest point of reach. We might encounter expensive edge cases
  38. * using this algorithm, so a little bit of heuristic is needed to cut the
  39. * search and to return a suboptimal point.
  40. */
static long xdl_split(unsigned long const *ha1, long off1, long lim1,
		      unsigned long const *ha2, long off2, long lim2,
		      long *kvdf, long *kvdb, int need_min, xdpsplit_t *spl,
		      xdalgoenv_t *xenv) {
	/*
	 * Bidirectional Myers search inside the box (off1,off2)-(lim1,lim2).
	 * kvdf/kvdb hold the furthest-reaching file1 index for each forward /
	 * backward diagonal. Returns the edit cost "ec" reached when the two
	 * searches meet (or when a heuristic cuts the search), filling *spl.
	 */
	long dmin = off1 - lim2, dmax = lim1 - off2;	/* diagonal range of the box */
	long fmid = off1 - off2, bmid = lim1 - lim2;	/* starting diagonals */
	long odd = (fmid - bmid) & 1;	/* parity decides which pass can detect overlap */
	long fmin = fmid, fmax = fmid;
	long bmin = bmid, bmax = bmid;
	long ec, d, i1, i2, prev1, best, dd, v, k;

	/*
	 * Set initial diagonal values for both forward and backward path.
	 */
	kvdf[fmid] = off1;
	kvdb[bmid] = lim1;

	for (ec = 1;; ec++) {
		int got_snake = 0;

		/*
		 * We need to extend the diagonal "domain" by one. If the next
		 * values exits the box boundaries we need to change it in the
		 * opposite direction because (max - min) must be a power of
		 * two.
		 *
		 * Also we initialize the external K value to -1 so that we can
		 * avoid extra conditions in the check inside the core loop.
		 */
		if (fmin > dmin)
			kvdf[--fmin - 1] = -1;
		else
			++fmin;
		if (fmax < dmax)
			kvdf[++fmax + 1] = -1;
		else
			--fmax;

		/* Forward pass: extend each diagonal, then follow its snake. */
		for (d = fmax; d >= fmin; d -= 2) {
			if (kvdf[d - 1] >= kvdf[d + 1])
				i1 = kvdf[d - 1] + 1;
			else
				i1 = kvdf[d + 1];
			prev1 = i1;
			i2 = i1 - d;
			/* Slide down the snake of matching records. */
			for (; i1 < lim1 && i2 < lim2 && ha1[i1] == ha2[i2]; i1++, i2++);
			if (i1 - prev1 > xenv->snake_cnt)
				got_snake = 1;
			kvdf[d] = i1;
			/* Overlap with the backward path => optimal split found. */
			if (odd && bmin <= d && d <= bmax && kvdb[d] <= i1) {
				spl->i1 = i1;
				spl->i2 = i2;
				spl->min_lo = spl->min_hi = 1;
				return ec;
			}
		}

		/*
		 * We need to extend the diagonal "domain" by one. If the next
		 * values exits the box boundaries we need to change it in the
		 * opposite direction because (max - min) must be a power of
		 * two.
		 *
		 * Also we initialize the external K value to -1 so that we can
		 * avoid extra conditions in the check inside the core loop.
		 */
		if (bmin > dmin)
			kvdb[--bmin - 1] = XDL_LINE_MAX;
		else
			++bmin;
		if (bmax < dmax)
			kvdb[++bmax + 1] = XDL_LINE_MAX;
		else
			--bmax;

		/* Backward pass: mirror of the forward pass, walking upward. */
		for (d = bmax; d >= bmin; d -= 2) {
			if (kvdb[d - 1] < kvdb[d + 1])
				i1 = kvdb[d - 1];
			else
				i1 = kvdb[d + 1] - 1;
			prev1 = i1;
			i2 = i1 - d;
			/* Slide up the snake of matching records. */
			for (; i1 > off1 && i2 > off2 && ha1[i1 - 1] == ha2[i2 - 1]; i1--, i2--);
			if (prev1 - i1 > xenv->snake_cnt)
				got_snake = 1;
			kvdb[d] = i1;
			/* Overlap with the forward path => optimal split found. */
			if (!odd && fmin <= d && d <= fmax && i1 <= kvdf[d]) {
				spl->i1 = i1;
				spl->i2 = i2;
				spl->min_lo = spl->min_hi = 1;
				return ec;
			}
		}

		/* When a minimal diff is required, never take the shortcuts below. */
		if (need_min)
			continue;

		/*
		 * If the edit cost is above the heuristic trigger and if
		 * we got a good snake, we sample current diagonals to see
		 * if some of them have reached an "interesting" path. Our
		 * measure is a function of the distance from the diagonal
		 * corner (i1 + i2) penalized with the distance from the
		 * mid diagonal itself. If this value is above the current
		 * edit cost times a magic factor (XDL_K_HEUR) we consider
		 * it interesting.
		 */
		if (got_snake && ec > xenv->heur_min) {
			for (best = 0, d = fmax; d >= fmin; d -= 2) {
				dd = d > fmid ? d - fmid: fmid - d;
				i1 = kvdf[d];
				i2 = i1 - d;
				v = (i1 - off1) + (i2 - off2) - dd;

				if (v > XDL_K_HEUR * ec && v > best &&
				    off1 + xenv->snake_cnt <= i1 && i1 < lim1 &&
				    off2 + xenv->snake_cnt <= i2 && i2 < lim2) {
					/* Accept only if a full snake precedes the point. */
					for (k = 1; ha1[i1 - k] == ha2[i2 - k]; k++)
						if (k == xenv->snake_cnt) {
							best = v;
							spl->i1 = i1;
							spl->i2 = i2;
							break;
						}
				}
			}
			if (best > 0) {
				spl->min_lo = 1;
				spl->min_hi = 0;
				return ec;
			}

			for (best = 0, d = bmax; d >= bmin; d -= 2) {
				dd = d > bmid ? d - bmid: bmid - d;
				i1 = kvdb[d];
				i2 = i1 - d;
				v = (lim1 - i1) + (lim2 - i2) - dd;

				if (v > XDL_K_HEUR * ec && v > best &&
				    off1 < i1 && i1 <= lim1 - xenv->snake_cnt &&
				    off2 < i2 && i2 <= lim2 - xenv->snake_cnt) {
					/* Accept only if a full snake follows the point. */
					for (k = 0; ha1[i1 + k] == ha2[i2 + k]; k++)
						if (k == xenv->snake_cnt - 1) {
							best = v;
							spl->i1 = i1;
							spl->i2 = i2;
							break;
						}
				}
			}
			if (best > 0) {
				spl->min_lo = 0;
				spl->min_hi = 1;
				return ec;
			}
		}

		/*
		 * Enough is enough. We spent too much time here and now we
		 * collect the furthest reaching path using the (i1 + i2)
		 * measure.
		 */
		if (ec >= xenv->mxcost) {
			long fbest, fbest1, bbest, bbest1;

			fbest = fbest1 = -1;
			for (d = fmax; d >= fmin; d -= 2) {
				i1 = XDL_MIN(kvdf[d], lim1);
				i2 = i1 - d;
				if (lim2 < i2)
					i1 = lim2 + d, i2 = lim2;
				if (fbest < i1 + i2) {
					fbest = i1 + i2;
					fbest1 = i1;
				}
			}

			bbest = bbest1 = XDL_LINE_MAX;
			for (d = bmax; d >= bmin; d -= 2) {
				i1 = XDL_MAX(off1, kvdb[d]);
				i2 = i1 - d;
				if (i2 < off2)
					i1 = off2 + d, i2 = off2;
				if (i1 + i2 < bbest) {
					bbest = i1 + i2;
					bbest1 = i1;
				}
			}

			/* Split at whichever direction made more progress. */
			if ((lim1 + lim2) - bbest < fbest - (off1 + off2)) {
				spl->i1 = fbest1;
				spl->i2 = fbest - fbest1;
				spl->min_lo = 1;
				spl->min_hi = 0;
			} else {
				spl->i1 = bbest1;
				spl->i2 = bbest - bbest1;
				spl->min_lo = 0;
				spl->min_hi = 1;
			}
			return ec;
		}
	}
}
  230. /*
  231. * Rule: "Divide et Impera" (divide & conquer). Recursively split the box in
  232. * sub-boxes by calling the box splitting function. Note that the real job
  233. * (marking changed lines) is done in the two boundary reaching checks.
  234. */
int xdl_recs_cmp(diffdata_t *dd1, long off1, long lim1,
		 diffdata_t *dd2, long off2, long lim2,
		 long *kvdf, long *kvdb, int need_min, xdalgoenv_t *xenv) {
	unsigned long const *ha1 = dd1->ha, *ha2 = dd2->ha;

	/*
	 * Shrink the box by walking through each diagonal snake (SW and NE):
	 * strip common records from the head and tail of the box first.
	 */
	for (; off1 < lim1 && off2 < lim2 && ha1[off1] == ha2[off2]; off1++, off2++);
	for (; off1 < lim1 && off2 < lim2 && ha1[lim1 - 1] == ha2[lim2 - 1]; lim1--, lim2--);

	/*
	 * If one dimension is empty, then all records on the other one must
	 * be obviously changed.
	 */
	if (off1 == lim1) {
		char *rchg2 = dd2->rchg;
		long *rindex2 = dd2->rindex;

		/* rindex maps compacted indices back to real record numbers. */
		for (; off2 < lim2; off2++)
			rchg2[rindex2[off2]] = 1;
	} else if (off2 == lim2) {
		char *rchg1 = dd1->rchg;
		long *rindex1 = dd1->rindex;

		for (; off1 < lim1; off1++)
			rchg1[rindex1[off1]] = 1;
	} else {
		xdpsplit_t spl;
		spl.i1 = spl.i2 = 0;

		/*
		 * Divide ...
		 */
		if (xdl_split(ha1, off1, lim1, ha2, off2, lim2, kvdf, kvdb,
			      need_min, &spl, xenv) < 0) {

			return -1;
		}

		/*
		 * ... et Impera: recurse on both halves, propagating each
		 * half's need-minimal flag as computed by xdl_split().
		 */
		if (xdl_recs_cmp(dd1, off1, spl.i1, dd2, off2, spl.i2,
				 kvdf, kvdb, spl.min_lo, xenv) < 0 ||
		    xdl_recs_cmp(dd1, spl.i1, lim1, dd2, spl.i2, lim2,
				 kvdf, kvdb, spl.min_hi, xenv) < 0) {

			return -1;
		}
	}

	return 0;
}
int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
		xdfenv_t *xe) {
	/*
	 * Run the record-level diff of mf1 against mf2 into xe. Dispatches to
	 * patience/histogram algorithms when requested by xpp->flags; otherwise
	 * runs the Myers algorithm via xdl_recs_cmp(). Returns 0 or -1.
	 */
	long ndiags;
	long *kvd, *kvdf, *kvdb;
	xdalgoenv_t xenv;
	diffdata_t dd1, dd2;

	if (XDF_DIFF_ALG(xpp->flags) == XDF_PATIENCE_DIFF)
		return xdl_do_patience_diff(mf1, mf2, xpp, xe);

	if (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF)
		return xdl_do_histogram_diff(mf1, mf2, xpp, xe);

	if (xdl_prepare_env(mf1, mf2, xpp, xe) < 0) {

		return -1;
	}

	/*
	 * Allocate and setup K vectors to be used by the differential
	 * algorithm.
	 *
	 * One is to store the forward path and one to store the backward path.
	 */
	ndiags = xe->xdf1.nreff + xe->xdf2.nreff + 3;
	if (!(kvd = (long *) xdl_malloc((2 * ndiags + 2) * sizeof(long)))) {

		xdl_free_env(xe);
		return -1;
	}
	kvdf = kvd;
	kvdb = kvdf + ndiags;
	/* Bias the pointers so negative diagonal indices stay in bounds. */
	kvdf += xe->xdf2.nreff + 1;
	kvdb += xe->xdf2.nreff + 1;

	/* Heuristic cut-offs scale with the number of diagonals. */
	xenv.mxcost = xdl_bogosqrt(ndiags);
	if (xenv.mxcost < XDL_MAX_COST_MIN)
		xenv.mxcost = XDL_MAX_COST_MIN;
	xenv.snake_cnt = XDL_SNAKE_CNT;
	xenv.heur_min = XDL_HEUR_MIN_COST;

	dd1.nrec = xe->xdf1.nreff;
	dd1.ha = xe->xdf1.ha;
	dd1.rchg = xe->xdf1.rchg;
	dd1.rindex = xe->xdf1.rindex;
	dd2.nrec = xe->xdf2.nreff;
	dd2.ha = xe->xdf2.ha;
	dd2.rchg = xe->xdf2.rchg;
	dd2.rindex = xe->xdf2.rindex;

	if (xdl_recs_cmp(&dd1, 0, dd1.nrec, &dd2, 0, dd2.nrec,
			 kvdf, kvdb, (xpp->flags & XDF_NEED_MINIMAL) != 0, &xenv) < 0) {

		xdl_free(kvd);
		xdl_free_env(xe);
		return -1;
	}

	xdl_free(kvd);

	return 0;
}
  330. static xdchange_t *xdl_add_change(xdchange_t *xscr, long i1, long i2, long chg1, long chg2) {
  331. xdchange_t *xch;
  332. if (!(xch = (xdchange_t *) xdl_malloc(sizeof(xdchange_t))))
  333. return NULL;
  334. xch->next = xscr;
  335. xch->i1 = i1;
  336. xch->i2 = i2;
  337. xch->chg1 = chg1;
  338. xch->chg2 = chg2;
  339. xch->ignore = 0;
  340. return xch;
  341. }
  342. static int recs_match(xrecord_t *rec1, xrecord_t *rec2, long flags)
  343. {
  344. return (rec1->ha == rec2->ha &&
  345. xdl_recmatch(rec1->ptr, rec1->size,
  346. rec2->ptr, rec2->size,
  347. flags));
  348. }
  349. /*
  350. * If a line is indented more than this, get_indent() just returns this value.
  351. * This avoids having to do absurd amounts of work for data that are not
  352. * human-readable text, and also ensures that the output of get_indent fits
  353. * within an int.
  354. */
  355. #define MAX_INDENT 200
  356. /*
  357. * Return the amount of indentation of the specified line, treating TAB as 8
  358. * columns. Return -1 if line is empty or contains only whitespace. Clamp the
  359. * output value at MAX_INDENT.
  360. */
  361. static int get_indent(xrecord_t *rec)
  362. {
  363. long i;
  364. int ret = 0;
  365. for (i = 0; i < rec->size; i++) {
  366. char c = rec->ptr[i];
  367. if (!XDL_ISSPACE(c))
  368. return ret;
  369. else if (c == ' ')
  370. ret += 1;
  371. else if (c == '\t')
  372. ret += 8 - ret % 8;
  373. /* ignore other whitespace characters */
  374. if (ret >= MAX_INDENT)
  375. return MAX_INDENT;
  376. }
  377. /* The line contains only whitespace. */
  378. return -1;
  379. }
  380. /*
  381. * If more than this number of consecutive blank rows are found, just return
  382. * this value. This avoids requiring O(N^2) work for pathological cases, and
  383. * also ensures that the output of score_split fits in an int.
  384. */
  385. #define MAX_BLANKS 20
/* Characteristics measured about a hypothetical split position. */
struct split_measurement {
	/*
	 * Is the split at the end of the file (aside from any blank lines)?
	 */
	int end_of_file;

	/*
	 * How much is the line immediately following the split indented (or -1
	 * if the line is blank):
	 */
	int indent;

	/*
	 * How many consecutive lines above the split are blank?
	 */
	int pre_blank;

	/*
	 * How much is the nearest non-blank line above the split indented (or
	 * -1 if there is no such line)?
	 */
	int pre_indent;

	/*
	 * How many lines after the line following the split are blank?
	 */
	int post_blank;

	/*
	 * How much is the nearest non-blank line after the line following the
	 * split indented (or -1 if there is no such line)?
	 */
	int post_indent;
};
/* Badness score for a pair of splits; filled in by score_add_split(). */
struct split_score {
	/* The effective indent of this split (smaller is preferred). */
	int effective_indent;

	/* Penalty for this split (smaller is preferred). */
	int penalty;
};
  422. /*
  423. * Fill m with information about a hypothetical split of xdf above line split.
  424. */
  425. static void measure_split(const xdfile_t *xdf, long split,
  426. struct split_measurement *m)
  427. {
  428. long i;
  429. if (split >= xdf->nrec) {
  430. m->end_of_file = 1;
  431. m->indent = -1;
  432. } else {
  433. m->end_of_file = 0;
  434. m->indent = get_indent(xdf->recs[split]);
  435. }
  436. m->pre_blank = 0;
  437. m->pre_indent = -1;
  438. for (i = split - 1; i >= 0; i--) {
  439. m->pre_indent = get_indent(xdf->recs[i]);
  440. if (m->pre_indent != -1)
  441. break;
  442. m->pre_blank += 1;
  443. if (m->pre_blank == MAX_BLANKS) {
  444. m->pre_indent = 0;
  445. break;
  446. }
  447. }
  448. m->post_blank = 0;
  449. m->post_indent = -1;
  450. for (i = split + 1; i < xdf->nrec; i++) {
  451. m->post_indent = get_indent(xdf->recs[i]);
  452. if (m->post_indent != -1)
  453. break;
  454. m->post_blank += 1;
  455. if (m->post_blank == MAX_BLANKS) {
  456. m->post_indent = 0;
  457. break;
  458. }
  459. }
  460. }
  461. /*
  462. * The empirically-determined weight factors used by score_split() below.
  463. * Larger values means that the position is a less favorable place to split.
  464. *
  465. * Note that scores are only ever compared against each other, so multiplying
  466. * all of these weight/penalty values by the same factor wouldn't change the
  467. * heuristic's behavior. Still, we need to set that arbitrary scale *somehow*.
  468. * In practice, these numbers are chosen to be large enough that they can be
  469. * adjusted relative to each other with sufficient precision despite using
  470. * integer math.
  471. */
  472. /* Penalty if there are no non-blank lines before the split */
  473. #define START_OF_FILE_PENALTY 1
  474. /* Penalty if there are no non-blank lines after the split */
  475. #define END_OF_FILE_PENALTY 21
  476. /* Multiplier for the number of blank lines around the split */
  477. #define TOTAL_BLANK_WEIGHT (-30)
  478. /* Multiplier for the number of blank lines after the split */
  479. #define POST_BLANK_WEIGHT 6
  480. /*
  481. * Penalties applied if the line is indented more than its predecessor
  482. */
  483. #define RELATIVE_INDENT_PENALTY (-4)
  484. #define RELATIVE_INDENT_WITH_BLANK_PENALTY 10
  485. /*
  486. * Penalties applied if the line is indented less than both its predecessor and
  487. * its successor
  488. */
  489. #define RELATIVE_OUTDENT_PENALTY 24
  490. #define RELATIVE_OUTDENT_WITH_BLANK_PENALTY 17
  491. /*
  492. * Penalties applied if the line is indented less than its predecessor but not
  493. * less than its successor
  494. */
  495. #define RELATIVE_DEDENT_PENALTY 23
  496. #define RELATIVE_DEDENT_WITH_BLANK_PENALTY 17
  497. /*
  498. * We only consider whether the sum of the effective indents for splits are
  499. * less than (-1), equal to (0), or greater than (+1) each other. The resulting
  500. * value is multiplied by the following weight and combined with the penalty to
  501. * determine the better of two scores.
  502. */
  503. #define INDENT_WEIGHT 60
  504. /*
  505. * How far do we slide a hunk at most?
  506. */
  507. #define INDENT_HEURISTIC_MAX_SLIDING 100
  508. /*
  509. * Compute a badness score for the hypothetical split whose measurements are
  510. * stored in m. The weight factors were determined empirically using the tools
  511. * and corpus described in
  512. *
  513. * https://github.com/mhagger/diff-slider-tools
  514. *
  515. * Also see that project if you want to improve the weights based on, for
  516. * example, a larger or more diverse corpus.
  517. */
  518. static void score_add_split(const struct split_measurement *m, struct split_score *s)
  519. {
  520. /*
  521. * A place to accumulate penalty factors (positive makes this index more
  522. * favored):
  523. */
  524. int post_blank, total_blank, indent, any_blanks;
  525. if (m->pre_indent == -1 && m->pre_blank == 0)
  526. s->penalty += START_OF_FILE_PENALTY;
  527. if (m->end_of_file)
  528. s->penalty += END_OF_FILE_PENALTY;
  529. /*
  530. * Set post_blank to the number of blank lines following the split,
  531. * including the line immediately after the split:
  532. */
  533. post_blank = (m->indent == -1) ? 1 + m->post_blank : 0;
  534. total_blank = m->pre_blank + post_blank;
  535. /* Penalties based on nearby blank lines: */
  536. s->penalty += TOTAL_BLANK_WEIGHT * total_blank;
  537. s->penalty += POST_BLANK_WEIGHT * post_blank;
  538. if (m->indent != -1)
  539. indent = m->indent;
  540. else
  541. indent = m->post_indent;
  542. any_blanks = (total_blank != 0);
  543. /* Note that the effective indent is -1 at the end of the file: */
  544. s->effective_indent += indent;
  545. if (indent == -1) {
  546. /* No additional adjustments needed. */
  547. } else if (m->pre_indent == -1) {
  548. /* No additional adjustments needed. */
  549. } else if (indent > m->pre_indent) {
  550. /*
  551. * The line is indented more than its predecessor.
  552. */
  553. s->penalty += any_blanks ?
  554. RELATIVE_INDENT_WITH_BLANK_PENALTY :
  555. RELATIVE_INDENT_PENALTY;
  556. } else if (indent == m->pre_indent) {
  557. /*
  558. * The line has the same indentation level as its predecessor.
  559. * No additional adjustments needed.
  560. */
  561. } else {
  562. /*
  563. * The line is indented less than its predecessor. It could be
  564. * the block terminator of the previous block, but it could
  565. * also be the start of a new block (e.g., an "else" block, or
  566. * maybe the previous block didn't have a block terminator).
  567. * Try to distinguish those cases based on what comes next:
  568. */
  569. if (m->post_indent != -1 && m->post_indent > indent) {
  570. /*
  571. * The following line is indented more. So it is likely
  572. * that this line is the start of a block.
  573. */
  574. s->penalty += any_blanks ?
  575. RELATIVE_OUTDENT_WITH_BLANK_PENALTY :
  576. RELATIVE_OUTDENT_PENALTY;
  577. } else {
  578. /*
  579. * That was probably the end of a block.
  580. */
  581. s->penalty += any_blanks ?
  582. RELATIVE_DEDENT_WITH_BLANK_PENALTY :
  583. RELATIVE_DEDENT_PENALTY;
  584. }
  585. }
  586. }
  587. static int score_cmp(struct split_score *s1, struct split_score *s2)
  588. {
  589. /* -1 if s1.effective_indent < s2->effective_indent, etc. */
  590. int cmp_indents = ((s1->effective_indent > s2->effective_indent) -
  591. (s1->effective_indent < s2->effective_indent));
  592. return INDENT_WEIGHT * cmp_indents + (s1->penalty - s2->penalty);
  593. }
  594. /*
  595. * Represent a group of changed lines in an xdfile_t (i.e., a contiguous group
  596. * of lines that was inserted or deleted from the corresponding version of the
  597. * file). We consider there to be such a group at the beginning of the file, at
  598. * the end of the file, and between any two unchanged lines, though most such
  599. * groups will usually be empty.
  600. *
  601. * If the first line in a group is equal to the line following the group, then
  602. * the group can be slid down. Similarly, if the last line in a group is equal
  603. * to the line preceding the group, then the group can be slid up. See
  604. * group_slide_down() and group_slide_up().
  605. *
  606. * Note that loops that are testing for changed lines in xdf->rchg do not need
  607. * index bounding since the array is prepared with a zero at position -1 and N.
  608. */
/* A contiguous group of changed lines in an xdfile_t; see the comment above. */
struct xdlgroup {
	/*
	 * The index of the first changed line in the group, or the index of
	 * the unchanged line above which the (empty) group is located.
	 */
	long start;

	/*
	 * The index of the first unchanged line after the group. For an empty
	 * group, end is equal to start.
	 */
	long end;
};
  621. /*
  622. * Initialize g to point at the first group in xdf.
  623. */
  624. static void group_init(xdfile_t *xdf, struct xdlgroup *g)
  625. {
  626. g->start = g->end = 0;
  627. while (xdf->rchg[g->end])
  628. g->end++;
  629. }
  630. /*
  631. * Move g to describe the next (possibly empty) group in xdf and return 0. If g
  632. * is already at the end of the file, do nothing and return -1.
  633. */
  634. static inline int group_next(xdfile_t *xdf, struct xdlgroup *g)
  635. {
  636. if (g->end == xdf->nrec)
  637. return -1;
  638. g->start = g->end + 1;
  639. for (g->end = g->start; xdf->rchg[g->end]; g->end++)
  640. ;
  641. return 0;
  642. }
  643. /*
  644. * Move g to describe the previous (possibly empty) group in xdf and return 0.
  645. * If g is already at the beginning of the file, do nothing and return -1.
  646. */
  647. static inline int group_previous(xdfile_t *xdf, struct xdlgroup *g)
  648. {
  649. if (g->start == 0)
  650. return -1;
  651. g->end = g->start - 1;
  652. for (g->start = g->end; xdf->rchg[g->start - 1]; g->start--)
  653. ;
  654. return 0;
  655. }
  656. /*
  657. * If g can be slid toward the end of the file, do so, and if it bumps into a
  658. * following group, expand this group to include it. Return 0 on success or -1
  659. * if g cannot be slid down.
  660. */
  661. static int group_slide_down(xdfile_t *xdf, struct xdlgroup *g, long flags)
  662. {
  663. if (g->end < xdf->nrec &&
  664. recs_match(xdf->recs[g->start], xdf->recs[g->end], flags)) {
  665. xdf->rchg[g->start++] = 0;
  666. xdf->rchg[g->end++] = 1;
  667. while (xdf->rchg[g->end])
  668. g->end++;
  669. return 0;
  670. } else {
  671. return -1;
  672. }
  673. }
  674. /*
  675. * If g can be slid toward the beginning of the file, do so, and if it bumps
  676. * into a previous group, expand this group to include it. Return 0 on success
  677. * or -1 if g cannot be slid up.
  678. */
  679. static int group_slide_up(xdfile_t *xdf, struct xdlgroup *g, long flags)
  680. {
  681. if (g->start > 0 &&
  682. recs_match(xdf->recs[g->start - 1], xdf->recs[g->end - 1], flags)) {
  683. xdf->rchg[--g->start] = 1;
  684. xdf->rchg[--g->end] = 0;
  685. while (xdf->rchg[g->start - 1])
  686. g->start--;
  687. return 0;
  688. } else {
  689. return -1;
  690. }
  691. }
/* Report a broken internal invariant and abort the process. */
static void xdl_bug(const char *msg)
{
	fprintf(stderr, "BUG: %s\n", msg);
	exit(1);
}
  697. /*
  698. * Move back and forward change groups for a consistent and pretty diff output.
  699. * This also helps in finding joinable change groups and reducing the diff
  700. * size.
  701. */
int xdl_change_compact(xdfile_t *xdf, xdfile_t *xdfo, long flags) {
	/*
	 * Walk all change groups in xdf (the file being compacted), keeping a
	 * cursor "go" into the corresponding groups of the other file xdfo so
	 * the two stay in sync. Each group is slid to its best position.
	 */
	struct xdlgroup g, go;
	long earliest_end, end_matching_other;
	long groupsize;

	group_init(xdf, &g);
	group_init(xdfo, &go);

	while (1) {
		/*
		 * If the group is empty in the to-be-compacted file, skip it:
		 */
		if (g.end == g.start)
			goto next;

		/*
		 * Now shift the change up and then down as far as possible in
		 * each direction. If it bumps into any other changes, merge
		 * them.
		 */
		do {
			groupsize = g.end - g.start;

			/*
			 * Keep track of the last "end" index that causes this
			 * group to align with a group of changed lines in the
			 * other file. -1 indicates that we haven't found such
			 * a match yet:
			 */
			end_matching_other = -1;

			/* Shift the group backward as much as possible: */
			while (!group_slide_up(xdf, &g, flags))
				if (group_previous(xdfo, &go))
					xdl_bug("group sync broken sliding up");

			/*
			 * This is the highest that this group can be shifted.
			 * Record its end index:
			 */
			earliest_end = g.end;

			if (go.end > go.start)
				end_matching_other = g.end;

			/* Now shift the group forward as far as possible: */
			while (1) {
				if (group_slide_down(xdf, &g, flags))
					break;
				if (group_next(xdfo, &go))
					xdl_bug("group sync broken sliding down");

				if (go.end > go.start)
					end_matching_other = g.end;
			}
			/* Sliding may have merged neighbors; redo if g grew. */
		} while (groupsize != g.end - g.start);

		/*
		 * If the group can be shifted, then we can possibly use this
		 * freedom to produce a more intuitive diff.
		 *
		 * The group is currently shifted as far down as possible, so
		 * the heuristics below only have to handle upwards shifts.
		 */
		if (g.end == earliest_end) {
			/* no shifting was possible */
		} else if (end_matching_other != -1) {
			/*
			 * Move the possibly merged group of changes back to
			 * line up with the last group of changes from the
			 * other file that it can align with.
			 */
			while (go.end == go.start) {
				if (group_slide_up(xdf, &g, flags))
					xdl_bug("match disappeared");
				if (group_previous(xdfo, &go))
					xdl_bug("group sync broken sliding to match");
			}
		} else if (flags & XDF_INDENT_HEURISTIC) {
			/*
			 * Indent heuristic: a group of pure add/delete lines
			 * implies two splits, one between the end of the
			 * "before" context and the start of the group, and
			 * another between the end of the group and the
			 * beginning of the "after" context. Some splits are
			 * aesthetically better and some are worse. We compute
			 * a badness "score" for each split, and add the scores
			 * for the two splits to define a "score" for each
			 * position that the group can be shifted to. Then we
			 * pick the shift with the lowest score.
			 */
			long shift, best_shift = -1;
			struct split_score best_score;

			/*
			 * Start from the highest reachable position, but cap
			 * the candidate range at INDENT_HEURISTIC_MAX_SLIDING
			 * shifts to bound the work.
			 */
			shift = earliest_end;
			if (g.end - groupsize - 1 > shift)
				shift = g.end - groupsize - 1;
			if (g.end - INDENT_HEURISTIC_MAX_SLIDING > shift)
				shift = g.end - INDENT_HEURISTIC_MAX_SLIDING;
			for (; shift <= g.end; shift++) {
				struct split_measurement m;
				struct split_score score = {0, 0};

				/* Score both splits implied by this position. */
				measure_split(xdf, shift, &m);
				score_add_split(&m, &score);
				measure_split(xdf, shift - groupsize, &m);
				score_add_split(&m, &score);
				if (best_shift == -1 ||
				    score_cmp(&score, &best_score) <= 0) {
					best_score.effective_indent = score.effective_indent;
					best_score.penalty = score.penalty;
					best_shift = shift;
				}
			}

			/* Slide up from the bottom to the winning position. */
			while (g.end > best_shift) {
				if (group_slide_up(xdf, &g, flags))
					xdl_bug("best shift unreached");
				if (group_previous(xdfo, &go))
					xdl_bug("group sync broken sliding to blank line");
			}
		}

	next:
		/* Move past the just-processed group: */
		if (group_next(xdf, &g))
			break;
		if (group_next(xdfo, &go))
			xdl_bug("group sync broken moving to next group");
	}

	/* Both cursors must run out of groups at the same time. */
	if (!group_next(xdfo, &go))
		xdl_bug("group sync broken at end of file");

	return 0;
}
int xdl_build_script(xdfenv_t *xe, xdchange_t **xscr) {
	/*
	 * Convert the rchg change flags of both files into a linked list of
	 * xdchange_t hunks, stored in *xscr (in ascending order, since the
	 * scan runs backward and prepends). Returns 0 or -1 on allocation
	 * failure.
	 */
	xdchange_t *cscr = NULL, *xch;
	char *rchg1 = xe->xdf1.rchg, *rchg2 = xe->xdf2.rchg;
	long i1, i2, l1, l2;

	/*
	 * Trivial. Collects "groups" of changes and creates an edit script.
	 * Scanning backward; the rchg arrays carry a zero sentinel at index
	 * -1 (see the group machinery above), so no bounds checks are needed.
	 */
	for (i1 = xe->xdf1.nrec, i2 = xe->xdf2.nrec; i1 >= 0 || i2 >= 0; i1--, i2--)
		if (rchg1[i1 - 1] || rchg2[i2 - 1]) {
			/* Walk to the start of the changed run in each file. */
			for (l1 = i1; rchg1[i1 - 1]; i1--);
			for (l2 = i2; rchg2[i2 - 1]; i2--);

			if (!(xch = xdl_add_change(cscr, i1, i2, l1 - i1, l2 - i2))) {
				xdl_free_script(cscr);
				return -1;
			}
			cscr = xch;
		}

	*xscr = cscr;

	return 0;
}
  842. void xdl_free_script(xdchange_t *xscr) {
  843. xdchange_t *xch;
  844. while ((xch = xscr) != NULL) {
  845. xscr = xscr->next;
  846. xdl_free(xch);
  847. }
  848. }
  849. static int xdl_call_hunk_func(xdfenv_t *xe, xdchange_t *xscr, xdemitcb_t *ecb,
  850. xdemitconf_t const *xecfg)
  851. {
  852. xdchange_t *xch, *xche;
  853. for (xch = xscr; xch; xch = xche->next) {
  854. xche = xdl_get_hunk(&xch, xecfg);
  855. if (!xch)
  856. break;
  857. if (xecfg->hunk_func(xch->i1, xche->i1 + xche->chg1 - xch->i1,
  858. xch->i2, xche->i2 + xche->chg2 - xch->i2,
  859. ecb->priv) < 0)
  860. return -1;
  861. }
  862. return 0;
  863. }
  864. static void xdl_mark_ignorable(xdchange_t *xscr, xdfenv_t *xe, long flags)
  865. {
  866. xdchange_t *xch;
  867. for (xch = xscr; xch; xch = xch->next) {
  868. int ignore = 1;
  869. xrecord_t **rec;
  870. long i;
  871. rec = &xe->xdf1.recs[xch->i1];
  872. for (i = 0; i < xch->chg1 && ignore; i++)
  873. ignore = xdl_blankline(rec[i]->ptr, rec[i]->size, flags);
  874. rec = &xe->xdf2.recs[xch->i2];
  875. for (i = 0; i < xch->chg2 && ignore; i++)
  876. ignore = xdl_blankline(rec[i]->ptr, rec[i]->size, flags);
  877. xch->ignore = ignore;
  878. }
  879. }
int xdl_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
	     xdemitconf_t const *xecfg, xdemitcb_t *ecb) {
	/*
	 * Top-level diff driver: run the diff, compact the change groups in
	 * both directions, build the edit script and emit it (either through
	 * the caller's hunk_func or the default emitter). Returns 0 or -1;
	 * all intermediate resources are released on every path.
	 */
	xdchange_t *xscr;
	xdfenv_t xe;
	/* Route output through the hunk callback when one is provided. */
	emit_func_t ef = xecfg->hunk_func ? xdl_call_hunk_func : xdl_emit_diff;

	if (xdl_do_diff(mf1, mf2, xpp, &xe) < 0) {

		return -1;
	}
	/* Compact each file's changes against the other, then build hunks. */
	if (xdl_change_compact(&xe.xdf1, &xe.xdf2, xpp->flags) < 0 ||
	    xdl_change_compact(&xe.xdf2, &xe.xdf1, xpp->flags) < 0 ||
	    xdl_build_script(&xe, &xscr) < 0) {

		xdl_free_env(&xe);
		return -1;
	}
	if (xscr) {
		if (xpp->flags & XDF_IGNORE_BLANK_LINES)
			xdl_mark_ignorable(xscr, &xe, xpp->flags);

		if (ef(&xe, xscr, ecb, xecfg) < 0) {

			xdl_free_script(xscr);
			xdl_free_env(&xe);
			return -1;
		}
		xdl_free_script(xscr);
	}
	xdl_free_env(&xe);

	return 0;
}