diffcore-rename.c 14.5 KB
Newer Older
1 2 3 4 5 6 7
/*
 * Copyright (C) 2005 Junio C Hamano
 */
#include "cache.h"
#include "diff.h"
#include "diffcore.h"

8 9 10 11 12 13 14
/* Table of rename/copy destinations */

static struct diff_rename_dst {
	struct diff_filespec *two;
	struct diff_filepair *pair;
} *rename_dst;
static int rename_dst_nr, rename_dst_alloc;
15

16 17
static struct diff_rename_dst *locate_rename_dst(struct diff_filespec *two,
						 int insert_ok)
18
{
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
	int first, last;

	first = 0;
	last = rename_dst_nr;
	while (last > first) {
		int next = (last + first) >> 1;
		struct diff_rename_dst *dst = &(rename_dst[next]);
		int cmp = strcmp(two->path, dst->two->path);
		if (!cmp)
			return dst;
		if (cmp < 0) {
			last = next;
			continue;
		}
		first = next+1;
	}
	/* not found */
	if (!insert_ok)
		return NULL;
	/* insert to make it at "first" */
	if (rename_dst_alloc <= rename_dst_nr) {
		rename_dst_alloc = alloc_nr(rename_dst_alloc);
		rename_dst = xrealloc(rename_dst,
				      rename_dst_alloc * sizeof(*rename_dst));
	}
	rename_dst_nr++;
	if (first < rename_dst_nr)
		memmove(rename_dst + first + 1, rename_dst + first,
			(rename_dst_nr - first - 1) * sizeof(*rename_dst));
J
Junio C Hamano 已提交
48 49
	rename_dst[first].two = alloc_filespec(two->path);
	fill_filespec(rename_dst[first].two, two->sha1, two->mode);
50 51
	rename_dst[first].pair = NULL;
	return &(rename_dst[first]);
52 53
}

54
/* Table of rename/copy src files */
55 56
static struct diff_rename_src {
	struct diff_filespec *one;
57
	unsigned short score; /* to remember the break score */
58
	unsigned src_path_left : 1;
59 60
} *rename_src;
static int rename_src_nr, rename_src_alloc;
61

62
static struct diff_rename_src *register_rename_src(struct diff_filespec *one,
63 64
						   int src_path_left,
						   unsigned short score)
65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
{
	int first, last;

	first = 0;
	last = rename_src_nr;
	while (last > first) {
		int next = (last + first) >> 1;
		struct diff_rename_src *src = &(rename_src[next]);
		int cmp = strcmp(one->path, src->one->path);
		if (!cmp)
			return src;
		if (cmp < 0) {
			last = next;
			continue;
		}
		first = next+1;
	}
82

83 84 85 86 87
	/* insert to make it at "first" */
	if (rename_src_alloc <= rename_src_nr) {
		rename_src_alloc = alloc_nr(rename_src_alloc);
		rename_src = xrealloc(rename_src,
				      rename_src_alloc * sizeof(*rename_src));
88
	}
89 90 91 92 93
	rename_src_nr++;
	if (first < rename_src_nr)
		memmove(rename_src + first + 1, rename_src + first,
			(rename_src_nr - first - 1) * sizeof(*rename_src));
	rename_src[first].one = one;
94
	rename_src[first].score = score;
95
	rename_src[first].src_path_left = src_path_left;
96
	return &(rename_src[first]);
97 98
}

99 100 101
static int is_exact_match(struct diff_filespec *src,
			  struct diff_filespec *dst,
			  int contents_too)
102 103
{
	if (src->sha1_valid && dst->sha1_valid &&
104
	    !hashcmp(src->sha1, dst->sha1))
105
		return 1;
106 107
	if (!contents_too)
		return 0;
108 109 110 111
	if (diff_populate_filespec(src, 1) || diff_populate_filespec(dst, 1))
		return 0;
	if (src->size != dst->size)
		return 0;
112 113
	if (src->sha1_valid && dst->sha1_valid)
	    return !hashcmp(src->sha1, dst->sha1);
114
	if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0))
115 116 117 118 119 120 121
		return 0;
	if (src->size == dst->size &&
	    !memcmp(src->data, dst->data, src->size))
		return 1;
	return 0;
}

122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
static int basename_same(struct diff_filespec *src, struct diff_filespec *dst)
{
	int src_len = strlen(src->path), dst_len = strlen(dst->path);
	while (src_len && dst_len) {
		char c1 = src->path[--src_len];
		char c2 = dst->path[--dst_len];
		if (c1 != c2)
			return 0;
		if (c1 == '/')
			return 1;
	}
	return (!src_len || src->path[src_len - 1] == '/') &&
		(!dst_len || dst->path[dst_len - 1] == '/');
}

137
struct diff_score {
138 139
	int src; /* index in rename_src */
	int dst; /* index in rename_dst */
140
	int score;
141
	int name_score;
142 143 144 145 146 147 148 149 150 151 152 153 154
};

static int estimate_similarity(struct diff_filespec *src,
			       struct diff_filespec *dst,
			       int minimum_score)
{
	/* src points at a file that existed in the original tree (or
	 * optionally a file in the destination tree) and dst points
	 * at a newly created file.  They may be quite similar, in which
	 * case we want to say src is renamed to dst or src is copied into
	 * dst, and then some edit has been applied to dst.
	 *
	 * Compare them and return how similar they are, representing
155 156 157 158 159
	 * the score as an integer between 0 and MAX_SCORE.
	 *
	 * When there is an exact match, it is considered a better
	 * match than anything else; the destination does not even
	 * call into this function in that case.
160
	 */
L
Linus Torvalds 已提交
161
	unsigned long max_size, delta_size, base_size, src_copied, literal_added;
162
	unsigned long delta_limit;
163 164
	int score;

165 166 167 168 169 170 171
	/* We deal only with regular files.  Symlink renames are handled
	 * only when they are exact matches --- in other words, no edits
	 * after renaming.
	 */
	if (!S_ISREG(src->mode) || !S_ISREG(dst->mode))
		return 0;

L
Linus Torvalds 已提交
172
	max_size = ((src->size > dst->size) ? src->size : dst->size);
173
	base_size = ((src->size < dst->size) ? src->size : dst->size);
L
Linus Torvalds 已提交
174
	delta_size = max_size - base_size;
175

176 177
	/* We would not consider edits that change the file size so
	 * drastically.  delta_size must be smaller than
178
	 * (MAX_SCORE-minimum_score)/MAX_SCORE * min(src->size, dst->size).
179
	 *
180 181 182
	 * Note that base_size == 0 case is handled here already
	 * and the final score computation below would not have a
	 * divide-by-zero issue.
183
	 */
184
	if (base_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)
185 186
		return 0;

J
Jeff King 已提交
187 188
	if ((!src->cnt_data && diff_populate_filespec(src, 0))
		|| (!dst->cnt_data && diff_populate_filespec(dst, 0)))
189 190
		return 0; /* error but caught downstream */

191

192 193
	delta_limit = (unsigned long)
		(base_size * (MAX_SCORE-minimum_score) / MAX_SCORE);
194
	if (diffcore_count_changes(src, dst,
195
				   &src->cnt_data, &dst->cnt_data,
196 197
				   delta_limit,
				   &src_copied, &literal_added))
198
		return 0;
199

200 201
	/* How similar are they?
	 * what percentage of material in dst are from source?
202
	 */
L
Linus Torvalds 已提交
203
	if (!dst->size)
204
		score = 0; /* should not happen */
205
	else
206
		score = (int)(src_copied * MAX_SCORE / max_size);
207 208 209
	return score;
}

J
Junio C Hamano 已提交
210
static void record_rename_pair(int dst_index, int src_index, int score)
211
{
212
	struct diff_filespec *src, *dst;
213
	struct diff_filepair *dp;
214

215 216
	if (rename_dst[dst_index].pair)
		die("internal error: dst already matched.");
217

218
	src = rename_src[src_index].one;
219
	src->count++;
220

221
	dst = rename_dst[dst_index].two;
222
	dst->count++;
223

224
	dp = diff_queue(NULL, src, dst);
225
	dp->renamed_pair = 1;
226 227 228 229
	if (!strcmp(src->path, dst->path))
		dp->score = rename_src[src_index].score;
	else
		dp->score = score;
230
	dp->source_stays = rename_src[src_index].src_path_left;
231
	rename_dst[dst_index].pair = dp;
232 233 234 235
}

/*
 * We sort the rename similarity matrix with the score, in descending
236
 * order (the most similar first).
237 238 239 240
 */
static int score_compare(const void *a_, const void *b_)
{
	const struct diff_score *a = a_, *b = b_;
241 242 243 244

	if (a->score == b->score)
		return b->name_score - a->name_score;

245 246 247
	return b->score - a->score;
}

248 249 250 251 252 253 254 255 256 257 258 259 260 261 262
static int compute_stays(struct diff_queue_struct *q,
			 struct diff_filespec *one)
{
	int i;
	for (i = 0; i < q->nr; i++) {
		struct diff_filepair *p = q->queue[i];
		if (strcmp(one->path, p->two->path))
			continue;
		if (DIFF_PAIR_RENAME(p)) {
			return 0; /* something else is renamed into this */
		}
	}
	return 1;
}

263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314
/*
 * Find exact renames first.
 *
 * The first round matches up the up-to-date entries,
 * and then during the second round we try to match
 * cache-dirty entries as well.
 *
 * Note: the rest of the rename logic depends on this
 * phase also populating all the filespecs for any
 * entry that isn't matched up with an exact rename,
 * see "is_exact_match()".
 */
static int find_exact_renames(void)
{
	int rename_count = 0;
	int contents_too;

	for (contents_too = 0; contents_too < 2; contents_too++) {
		int i;

		for (i = 0; i < rename_dst_nr; i++) {
			struct diff_filespec *two = rename_dst[i].two;
			int j;

			if (rename_dst[i].pair)
				continue; /* dealt with an earlier round */
			for (j = 0; j < rename_src_nr; j++) {
				int k;
				struct diff_filespec *one = rename_src[j].one;
				if (!is_exact_match(one, two, contents_too))
					continue;

				/* see if there is a basename match, too */
				for (k = j; k < rename_src_nr; k++) {
					one = rename_src[k].one;
					if (basename_same(one, two) &&
						is_exact_match(one, two,
							contents_too)) {
						j = k;
						break;
					}
				}

				record_rename_pair(i, j, (int)MAX_SCORE);
				rename_count++;
				break; /* we are done with this entry */
			}
		}
	}
	return rename_count;
}

315
void diffcore_rename(struct diff_options *options)
316
{
317 318 319
	int detect_rename = options->detect_rename;
	int minimum_score = options->rename_score;
	int rename_limit = options->rename_limit;
320
	struct diff_queue_struct *q = &diff_queued_diff;
J
Junio C Hamano 已提交
321
	struct diff_queue_struct outq;
322
	struct diff_score *mx;
323
	int i, j, rename_count;
324
	int num_create, num_src, dst_cnt;
325

326
	if (!minimum_score)
327
		minimum_score = DEFAULT_RENAME_SCORE;
328 329

	for (i = 0; i < q->nr; i++) {
330
		struct diff_filepair *p = q->queue[i];
331
		if (!DIFF_FILE_VALID(p->one)) {
332
			if (!DIFF_FILE_VALID(p->two))
333
				continue; /* unmerged */
334 335 336
			else if (options->single_follow &&
				 strcmp(options->single_follow, p->two->path))
				continue; /* not interested */
337
			else
338
				locate_rename_dst(p->two, 1);
339
		}
340 341 342 343 344 345
		else if (!DIFF_FILE_VALID(p->two)) {
			/* If the source is a broken "delete", and
			 * they did not really want to get broken,
			 * that means the source actually stays.
			 */
			int stays = (p->broken_pair && !p->score);
346
			register_rename_src(p->one, stays, p->score);
347
		}
348
		else if (detect_rename == DIFF_DETECT_COPY)
349
			register_rename_src(p->one, 1, p->score);
350
	}
351
	if (rename_dst_nr == 0 || rename_src_nr == 0)
352 353
		goto cleanup; /* nothing to do */

354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369
	/*
	 * This basically does a test for the rename matrix not
	 * growing larger than a "rename_limit" square matrix, ie:
	 *
	 *    rename_dst_nr * rename_src_nr > rename_limit * rename_limit
	 *
	 * but handles the potential overflow case specially (and we
	 * assume at least 32-bit integers)
	 */
	if (rename_limit <= 0 || rename_limit > 32767)
		rename_limit = 32767;
	if (rename_dst_nr > rename_limit && rename_src_nr > rename_limit)
		goto cleanup;
	if (rename_dst_nr * rename_src_nr > rename_limit * rename_limit)
		goto cleanup;

370 371
	/*
	 * We really want to cull the candidates list early
372 373
	 * with cheap tests in order to avoid doing deltas.
	 */
374
	rename_count = find_exact_renames();
375 376 377 378

	/* Have we run out the created file pool?  If so we can avoid
	 * doing the delta matrix altogether.
	 */
J
Junio C Hamano 已提交
379
	if (rename_count == rename_dst_nr)
380
		goto cleanup;
381

382 383 384
	if (minimum_score == MAX_SCORE)
		goto cleanup;

J
Junio C Hamano 已提交
385
	num_create = (rename_dst_nr - rename_count);
386
	num_src = rename_src_nr;
387
	mx = xmalloc(sizeof(*mx) * num_create * num_src);
388
	for (dst_cnt = i = 0; i < rename_dst_nr; i++) {
389
		int base = dst_cnt * num_src;
390 391
		struct diff_filespec *two = rename_dst[i].two;
		if (rename_dst[i].pair)
392
			continue; /* dealt with exact match already. */
393 394 395 396 397 398 399
		for (j = 0; j < rename_src_nr; j++) {
			struct diff_filespec *one = rename_src[j].one;
			struct diff_score *m = &mx[base+j];
			m->src = j;
			m->dst = i;
			m->score = estimate_similarity(one, two,
						       minimum_score);
400
			m->name_score = basename_same(one, two);
401
			diff_free_filespec_blob(one);
402
		}
403
		/* We do not need the text anymore */
404
		diff_free_filespec_blob(two);
405 406 407 408 409
		dst_cnt++;
	}
	/* cost matrix sorted by most to least similar pair */
	qsort(mx, num_create * num_src, sizeof(*mx), score_compare);
	for (i = 0; i < num_create * num_src; i++) {
410 411 412
		struct diff_rename_dst *dst = &rename_dst[mx[i].dst];
		if (dst->pair)
			continue; /* already done, either exact or fuzzy. */
413
		if (mx[i].score < minimum_score)
414
			break; /* there is no more usable pair. */
J
Junio C Hamano 已提交
415 416
		record_rename_pair(mx[i].dst, mx[i].src, mx[i].score);
		rename_count++;
417 418 419
	}
	free(mx);

420
 cleanup:
421
	/* At this point, we have found some renames and copies and they
J
Junio C Hamano 已提交
422
	 * are recorded in rename_dst.  The original list is still in *q.
423
	 */
424 425
	outq.queue = NULL;
	outq.nr = outq.alloc = 0;
426
	for (i = 0; i < q->nr; i++) {
427 428 429
		struct diff_filepair *p = q->queue[i];
		struct diff_filepair *pair_to_free = NULL;

430 431 432 433 434 435 436 437 438 439
		if (!DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) {
			/*
			 * Creation
			 *
			 * We would output this create record if it has
			 * not been turned into a rename/copy already.
			 */
			struct diff_rename_dst *dst =
				locate_rename_dst(p->two, 0);
			if (dst && dst->pair) {
440 441 442 443
				diff_q(&outq, dst->pair);
				pair_to_free = p;
			}
			else
444 445
				/* no matching rename/copy source, so
				 * record this as a creation.
446 447
				 */
				diff_q(&outq, p);
448
		}
449 450 451 452
		else if (DIFF_FILE_VALID(p->one) && !DIFF_FILE_VALID(p->two)) {
			/*
			 * Deletion
			 *
453 454 455 456
			 * We would output this delete record if:
			 *
			 * (1) this is a broken delete and the counterpart
			 *     broken create remains in the output; or
J
Junio C Hamano 已提交
457 458 459
			 * (2) this is not a broken delete, and rename_dst
			 *     does not have a rename/copy to move p->one->path
			 *     out of existence.
460 461 462 463 464
			 *
			 * Otherwise, the counterpart broken create
			 * has been turned into a rename-edit; or
			 * delete did not have a matching create to
			 * begin with.
465
			 */
466 467 468 469 470 471 472 473 474
			if (DIFF_PAIR_BROKEN(p)) {
				/* broken delete */
				struct diff_rename_dst *dst =
					locate_rename_dst(p->one, 0);
				if (dst && dst->pair)
					/* counterpart is now rename/copy */
					pair_to_free = p;
			}
			else {
J
Junio C Hamano 已提交
475 476 477 478 479 480 481 482 483 484
				for (j = 0; j < rename_dst_nr; j++) {
					if (!rename_dst[j].pair)
						continue;
					if (strcmp(rename_dst[j].pair->
						   one->path,
						   p->one->path))
						continue;
					break;
				}
				if (j < rename_dst_nr)
485 486 487
					/* this path remains */
					pair_to_free = p;
			}
488 489 490 491 492 493

			if (pair_to_free)
				;
			else
				diff_q(&outq, p);
		}
494
		else if (!diff_unmodified_pair(p))
495
			/* all the usual ones need to be kept */
496
			diff_q(&outq, p);
497 498 499 500
		else
			/* no need to keep unmodified pairs */
			pair_to_free = p;

501 502
		if (pair_to_free)
			diff_free_filepair(pair_to_free);
503
	}
504
	diff_debug_queue("done copying original", &outq);
505

506 507 508
	free(q->queue);
	*q = outq;
	diff_debug_queue("done collapsing", q);
509

510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526
	/* We need to see which rename source really stays here;
	 * earlier we only checked if the path is left in the result,
	 * but even if a path remains in the result, if that is coming
	 * from copying something else on top of it, then the original
	 * source is lost and does not stay.
	 */
	for (i = 0; i < q->nr; i++) {
		struct diff_filepair *p = q->queue[i];
		if (DIFF_PAIR_RENAME(p) && p->source_stays) {
			/* If one appears as the target of a rename-copy,
			 * then mark p->source_stays = 0; otherwise
			 * leave it as is.
			 */
			p->source_stays = compute_stays(q, p->one);
		}
	}

527 528
	for (i = 0; i < rename_dst_nr; i++)
		free_filespec(rename_dst[i].two);
J
Junio C Hamano 已提交
529

530 531 532 533 534 535
	free(rename_dst);
	rename_dst = NULL;
	rename_dst_nr = rename_dst_alloc = 0;
	free(rename_src);
	rename_src = NULL;
	rename_src_nr = rename_src_alloc = 0;
536 537
	return;
}