THIS IS A TEST INSTANCE ONLY! REPOSITORIES CAN BE DELETED AT ANY TIME!

Git Source Code Mirror - This is a publish-only repository and all pull requests are ignored. Please follow Documentation/SubmittingPatches procedure for any of your improvements.
git
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

234 lines
5.5KB

  1. #include "cache.h"
  2. #include "diff.h"
  3. #include "diffcore.h"
  4. /*
  5. * Idea here is very simple.
  6. *
  7. * Almost all data we are interested in are text, but sometimes we have
  8. * to deal with binary data. So we cut them into chunks delimited by
  9. * LF byte, or 64-byte sequence, whichever comes first, and hash them.
  10. *
  11. * For those chunks, if the source buffer has more instances of it
  12. * than the destination buffer, that means the difference are the
  13. * number of bytes not copied from source to destination. If the
  14. * counts are the same, everything was copied from source to
  15. * destination. If the destination has more, everything was copied,
  16. * and destination added more.
  17. *
  18. * We are doing an approximation so we do not really have to waste
  19. * memory by actually storing the sequence. We just hash them into
  20. * somewhere around 2^16 hashbuckets and count the occurrences.
  21. */
  22. /* Wild guess at the initial hash size */
  23. #define INITIAL_HASH_SIZE 9
  24. /* We leave more room in smaller hash but do not let it
  25. * grow to have unused hole too much.
  26. */
  27. #define INITIAL_FREE(sz_log2) ((1<<(sz_log2))*(sz_log2-3)/(sz_log2))
  28. /* A prime rather carefully chosen between 2^16..2^17, so that
  29. * HASHBASE < INITIAL_FREE(17). We want to keep the maximum hashtable
  30. * size under the current 2<<17 maximum, which can hold this many
  31. * different values before overflowing to hashtable of size 2<<18.
  32. */
  33. #define HASHBASE 107927
  34. struct spanhash {
  35. unsigned int hashval;
  36. unsigned int cnt;
  37. };
  38. struct spanhash_top {
  39. int alloc_log2;
  40. int free;
  41. struct spanhash data[FLEX_ARRAY];
  42. };
  43. static struct spanhash_top *spanhash_rehash(struct spanhash_top *orig)
  44. {
  45. struct spanhash_top *new_spanhash;
  46. int i;
  47. int osz = 1 << orig->alloc_log2;
  48. int sz = osz << 1;
  49. new_spanhash = xmalloc(st_add(sizeof(*orig),
  50. st_mult(sizeof(struct spanhash), sz)));
  51. new_spanhash->alloc_log2 = orig->alloc_log2 + 1;
  52. new_spanhash->free = INITIAL_FREE(new_spanhash->alloc_log2);
  53. memset(new_spanhash->data, 0, sizeof(struct spanhash) * sz);
  54. for (i = 0; i < osz; i++) {
  55. struct spanhash *o = &(orig->data[i]);
  56. int bucket;
  57. if (!o->cnt)
  58. continue;
  59. bucket = o->hashval & (sz - 1);
  60. while (1) {
  61. struct spanhash *h = &(new_spanhash->data[bucket++]);
  62. if (!h->cnt) {
  63. h->hashval = o->hashval;
  64. h->cnt = o->cnt;
  65. new_spanhash->free--;
  66. break;
  67. }
  68. if (sz <= bucket)
  69. bucket = 0;
  70. }
  71. }
  72. free(orig);
  73. return new_spanhash;
  74. }
  75. static struct spanhash_top *add_spanhash(struct spanhash_top *top,
  76. unsigned int hashval, int cnt)
  77. {
  78. int bucket, lim;
  79. struct spanhash *h;
  80. lim = (1 << top->alloc_log2);
  81. bucket = hashval & (lim - 1);
  82. while (1) {
  83. h = &(top->data[bucket++]);
  84. if (!h->cnt) {
  85. h->hashval = hashval;
  86. h->cnt = cnt;
  87. top->free--;
  88. if (top->free < 0)
  89. return spanhash_rehash(top);
  90. return top;
  91. }
  92. if (h->hashval == hashval) {
  93. h->cnt += cnt;
  94. return top;
  95. }
  96. if (lim <= bucket)
  97. bucket = 0;
  98. }
  99. }
  100. static int spanhash_cmp(const void *a_, const void *b_)
  101. {
  102. const struct spanhash *a = a_;
  103. const struct spanhash *b = b_;
  104. /* A count of zero compares at the end.. */
  105. if (!a->cnt)
  106. return !b->cnt ? 0 : 1;
  107. if (!b->cnt)
  108. return -1;
  109. return a->hashval < b->hashval ? -1 :
  110. a->hashval > b->hashval ? 1 : 0;
  111. }
  112. static struct spanhash_top *hash_chars(struct repository *r,
  113. struct diff_filespec *one)
  114. {
  115. int i, n;
  116. unsigned int accum1, accum2, hashval;
  117. struct spanhash_top *hash;
  118. unsigned char *buf = one->data;
  119. unsigned int sz = one->size;
  120. int is_text = !diff_filespec_is_binary(r, one);
  121. i = INITIAL_HASH_SIZE;
  122. hash = xmalloc(st_add(sizeof(*hash),
  123. st_mult(sizeof(struct spanhash), 1<<i)));
  124. hash->alloc_log2 = i;
  125. hash->free = INITIAL_FREE(i);
  126. memset(hash->data, 0, sizeof(struct spanhash) * (1<<i));
  127. n = 0;
  128. accum1 = accum2 = 0;
  129. while (sz) {
  130. unsigned int c = *buf++;
  131. unsigned int old_1 = accum1;
  132. sz--;
  133. /* Ignore CR in CRLF sequence if text */
  134. if (is_text && c == '\r' && sz && *buf == '\n')
  135. continue;
  136. accum1 = (accum1 << 7) ^ (accum2 >> 25);
  137. accum2 = (accum2 << 7) ^ (old_1 >> 25);
  138. accum1 += c;
  139. if (++n < 64 && c != '\n')
  140. continue;
  141. hashval = (accum1 + accum2 * 0x61) % HASHBASE;
  142. hash = add_spanhash(hash, hashval, n);
  143. n = 0;
  144. accum1 = accum2 = 0;
  145. }
  146. QSORT(hash->data, 1ul << hash->alloc_log2, spanhash_cmp);
  147. return hash;
  148. }
  149. int diffcore_count_changes(struct repository *r,
  150. struct diff_filespec *src,
  151. struct diff_filespec *dst,
  152. void **src_count_p,
  153. void **dst_count_p,
  154. unsigned long *src_copied,
  155. unsigned long *literal_added)
  156. {
  157. struct spanhash *s, *d;
  158. struct spanhash_top *src_count, *dst_count;
  159. unsigned long sc, la;
  160. src_count = dst_count = NULL;
  161. if (src_count_p)
  162. src_count = *src_count_p;
  163. if (!src_count) {
  164. src_count = hash_chars(r, src);
  165. if (src_count_p)
  166. *src_count_p = src_count;
  167. }
  168. if (dst_count_p)
  169. dst_count = *dst_count_p;
  170. if (!dst_count) {
  171. dst_count = hash_chars(r, dst);
  172. if (dst_count_p)
  173. *dst_count_p = dst_count;
  174. }
  175. sc = la = 0;
  176. s = src_count->data;
  177. d = dst_count->data;
  178. for (;;) {
  179. unsigned dst_cnt, src_cnt;
  180. if (!s->cnt)
  181. break; /* we checked all in src */
  182. while (d->cnt) {
  183. if (d->hashval >= s->hashval)
  184. break;
  185. la += d->cnt;
  186. d++;
  187. }
  188. src_cnt = s->cnt;
  189. dst_cnt = 0;
  190. if (d->cnt && d->hashval == s->hashval) {
  191. dst_cnt = d->cnt;
  192. d++;
  193. }
  194. if (src_cnt < dst_cnt) {
  195. la += dst_cnt - src_cnt;
  196. sc += src_cnt;
  197. }
  198. else
  199. sc += dst_cnt;
  200. s++;
  201. }
  202. while (d->cnt) {
  203. la += d->cnt;
  204. d++;
  205. }
  206. if (!src_count_p)
  207. free(src_count);
  208. if (!dst_count_p)
  209. free(dst_count);
  210. *src_copied = sc;
  211. *literal_added = la;
  212. return 0;
  213. }