diffcore-rename.c 13.5 KB
Newer Older
1 2 3 4 5 6 7
/*
 * Copyright (C) 2005 Junio C Hamano
 */
#include "cache.h"
#include "diff.h"
#include "diffcore.h"

8 9 10 11 12 13 14
/* Table of rename/copy destinations */

static struct diff_rename_dst {
	struct diff_filespec *two;
	struct diff_filepair *pair;
} *rename_dst;
static int rename_dst_nr, rename_dst_alloc;
15

16 17
static struct diff_rename_dst *locate_rename_dst(struct diff_filespec *two,
						 int insert_ok)
18
{
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
	int first, last;

	first = 0;
	last = rename_dst_nr;
	while (last > first) {
		int next = (last + first) >> 1;
		struct diff_rename_dst *dst = &(rename_dst[next]);
		int cmp = strcmp(two->path, dst->two->path);
		if (!cmp)
			return dst;
		if (cmp < 0) {
			last = next;
			continue;
		}
		first = next+1;
	}
	/* not found */
	if (!insert_ok)
		return NULL;
	/* insert to make it at "first" */
	if (rename_dst_alloc <= rename_dst_nr) {
		rename_dst_alloc = alloc_nr(rename_dst_alloc);
		rename_dst = xrealloc(rename_dst,
				      rename_dst_alloc * sizeof(*rename_dst));
	}
	rename_dst_nr++;
	if (first < rename_dst_nr)
		memmove(rename_dst + first + 1, rename_dst + first,
			(rename_dst_nr - first - 1) * sizeof(*rename_dst));
J
Junio C Hamano 已提交
48 49
	rename_dst[first].two = alloc_filespec(two->path);
	fill_filespec(rename_dst[first].two, two->sha1, two->mode);
50 51
	rename_dst[first].pair = NULL;
	return &(rename_dst[first]);
52 53
}

54
/* Table of rename/copy src files */
55 56
static struct diff_rename_src {
	struct diff_filespec *one;
57
	unsigned short score; /* to remember the break score */
58 59
} *rename_src;
static int rename_src_nr, rename_src_alloc;
60

61
static struct diff_rename_src *register_rename_src(struct diff_filespec *one,
62
						   unsigned short score)
63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
{
	int first, last;

	first = 0;
	last = rename_src_nr;
	while (last > first) {
		int next = (last + first) >> 1;
		struct diff_rename_src *src = &(rename_src[next]);
		int cmp = strcmp(one->path, src->one->path);
		if (!cmp)
			return src;
		if (cmp < 0) {
			last = next;
			continue;
		}
		first = next+1;
	}
80

81 82 83 84 85
	/* insert to make it at "first" */
	if (rename_src_alloc <= rename_src_nr) {
		rename_src_alloc = alloc_nr(rename_src_alloc);
		rename_src = xrealloc(rename_src,
				      rename_src_alloc * sizeof(*rename_src));
86
	}
87 88 89 90 91
	rename_src_nr++;
	if (first < rename_src_nr)
		memmove(rename_src + first + 1, rename_src + first,
			(rename_src_nr - first - 1) * sizeof(*rename_src));
	rename_src[first].one = one;
92
	rename_src[first].score = score;
93
	return &(rename_src[first]);
94 95
}

96 97 98
static int is_exact_match(struct diff_filespec *src,
			  struct diff_filespec *dst,
			  int contents_too)
99 100
{
	if (src->sha1_valid && dst->sha1_valid &&
101
	    !hashcmp(src->sha1, dst->sha1))
102
		return 1;
103 104
	if (!contents_too)
		return 0;
105 106 107 108
	if (diff_populate_filespec(src, 1) || diff_populate_filespec(dst, 1))
		return 0;
	if (src->size != dst->size)
		return 0;
109 110
	if (src->sha1_valid && dst->sha1_valid)
	    return !hashcmp(src->sha1, dst->sha1);
111
	if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0))
112 113 114 115 116 117 118
		return 0;
	if (src->size == dst->size &&
	    !memcmp(src->data, dst->data, src->size))
		return 1;
	return 0;
}

119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
static int basename_same(struct diff_filespec *src, struct diff_filespec *dst)
{
	int src_len = strlen(src->path), dst_len = strlen(dst->path);
	while (src_len && dst_len) {
		char c1 = src->path[--src_len];
		char c2 = dst->path[--dst_len];
		if (c1 != c2)
			return 0;
		if (c1 == '/')
			return 1;
	}
	return (!src_len || src->path[src_len - 1] == '/') &&
		(!dst_len || dst->path[dst_len - 1] == '/');
}

134
struct diff_score {
135 136
	int src; /* index in rename_src */
	int dst; /* index in rename_dst */
137
	int score;
138
	int name_score;
139 140 141 142 143 144 145 146 147 148 149 150 151
};

static int estimate_similarity(struct diff_filespec *src,
			       struct diff_filespec *dst,
			       int minimum_score)
{
	/* src points at a file that existed in the original tree (or
	 * optionally a file in the destination tree) and dst points
	 * at a newly created file.  They may be quite similar, in which
	 * case we want to say src is renamed to dst or src is copied into
	 * dst, and then some edit has been applied to dst.
	 *
	 * Compare them and return how similar they are, representing
152 153 154 155 156
	 * the score as an integer between 0 and MAX_SCORE.
	 *
	 * When there is an exact match, it is considered a better
	 * match than anything else; the destination does not even
	 * call into this function in that case.
157
	 */
L
Linus Torvalds 已提交
158
	unsigned long max_size, delta_size, base_size, src_copied, literal_added;
159
	unsigned long delta_limit;
160 161
	int score;

162 163 164 165 166 167 168
	/* We deal only with regular files.  Symlink renames are handled
	 * only when they are exact matches --- in other words, no edits
	 * after renaming.
	 */
	if (!S_ISREG(src->mode) || !S_ISREG(dst->mode))
		return 0;

L
Linus Torvalds 已提交
169
	max_size = ((src->size > dst->size) ? src->size : dst->size);
170
	base_size = ((src->size < dst->size) ? src->size : dst->size);
L
Linus Torvalds 已提交
171
	delta_size = max_size - base_size;
172

173 174
	/* We would not consider edits that change the file size so
	 * drastically.  delta_size must be smaller than
175
	 * (MAX_SCORE-minimum_score)/MAX_SCORE * min(src->size, dst->size).
176
	 *
177 178 179
	 * Note that base_size == 0 case is handled here already
	 * and the final score computation below would not have a
	 * divide-by-zero issue.
180
	 */
181
	if (base_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)
182 183
		return 0;

J
Jeff King 已提交
184 185
	if ((!src->cnt_data && diff_populate_filespec(src, 0))
		|| (!dst->cnt_data && diff_populate_filespec(dst, 0)))
186 187
		return 0; /* error but caught downstream */

188

189 190
	delta_limit = (unsigned long)
		(base_size * (MAX_SCORE-minimum_score) / MAX_SCORE);
191
	if (diffcore_count_changes(src, dst,
192
				   &src->cnt_data, &dst->cnt_data,
193 194
				   delta_limit,
				   &src_copied, &literal_added))
195
		return 0;
196

197 198
	/* How similar are they?
	 * what percentage of material in dst are from source?
199
	 */
L
Linus Torvalds 已提交
200
	if (!dst->size)
201
		score = 0; /* should not happen */
202
	else
203
		score = (int)(src_copied * MAX_SCORE / max_size);
204 205 206
	return score;
}

J
Junio C Hamano 已提交
207
static void record_rename_pair(int dst_index, int src_index, int score)
208
{
209
	struct diff_filespec *src, *dst;
210
	struct diff_filepair *dp;
211

212 213
	if (rename_dst[dst_index].pair)
		die("internal error: dst already matched.");
214

215
	src = rename_src[src_index].one;
216
	src->rename_used++;
217
	src->count++;
218

219
	dst = rename_dst[dst_index].two;
220
	dst->count++;
221

222
	dp = diff_queue(NULL, src, dst);
223
	dp->renamed_pair = 1;
224 225 226 227
	if (!strcmp(src->path, dst->path))
		dp->score = rename_src[src_index].score;
	else
		dp->score = score;
228
	rename_dst[dst_index].pair = dp;
229 230 231 232
}

/*
 * We sort the rename similarity matrix with the score, in descending
233
 * order (the most similar first).
234 235 236 237
 */
static int score_compare(const void *a_, const void *b_)
{
	const struct diff_score *a = a_, *b = b_;
238 239 240 241

	if (a->score == b->score)
		return b->name_score - a->name_score;

242 243 244
	return b->score - a->score;
}

245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296
/*
 * Find exact renames first.
 *
 * The first round matches up the up-to-date entries,
 * and then during the second round we try to match
 * cache-dirty entries as well.
 *
 * Note: the rest of the rename logic depends on this
 * phase also populating all the filespecs for any
 * entry that isn't matched up with an exact rename,
 * see "is_exact_match()".
 */
static int find_exact_renames(void)
{
	int rename_count = 0;
	int contents_too;

	for (contents_too = 0; contents_too < 2; contents_too++) {
		int i;

		for (i = 0; i < rename_dst_nr; i++) {
			struct diff_filespec *two = rename_dst[i].two;
			int j;

			if (rename_dst[i].pair)
				continue; /* dealt with an earlier round */
			for (j = 0; j < rename_src_nr; j++) {
				int k;
				struct diff_filespec *one = rename_src[j].one;
				if (!is_exact_match(one, two, contents_too))
					continue;

				/* see if there is a basename match, too */
				for (k = j; k < rename_src_nr; k++) {
					one = rename_src[k].one;
					if (basename_same(one, two) &&
						is_exact_match(one, two,
							contents_too)) {
						j = k;
						break;
					}
				}

				record_rename_pair(i, j, (int)MAX_SCORE);
				rename_count++;
				break; /* we are done with this entry */
			}
		}
	}
	return rename_count;
}

297
void diffcore_rename(struct diff_options *options)
298
{
299 300 301
	int detect_rename = options->detect_rename;
	int minimum_score = options->rename_score;
	int rename_limit = options->rename_limit;
302
	struct diff_queue_struct *q = &diff_queued_diff;
J
Junio C Hamano 已提交
303
	struct diff_queue_struct outq;
304
	struct diff_score *mx;
305
	int i, j, rename_count;
306
	int num_create, num_src, dst_cnt;
307

308
	if (!minimum_score)
309
		minimum_score = DEFAULT_RENAME_SCORE;
310 311

	for (i = 0; i < q->nr; i++) {
312
		struct diff_filepair *p = q->queue[i];
313
		if (!DIFF_FILE_VALID(p->one)) {
314
			if (!DIFF_FILE_VALID(p->two))
315
				continue; /* unmerged */
316 317 318
			else if (options->single_follow &&
				 strcmp(options->single_follow, p->two->path))
				continue; /* not interested */
319
			else
320
				locate_rename_dst(p->two, 1);
321
		}
322
		else if (!DIFF_FILE_VALID(p->two)) {
323 324
			/*
			 * If the source is a broken "delete", and
325 326
			 * they did not really want to get broken,
			 * that means the source actually stays.
327 328 329 330 331 332 333 334 335 336 337
			 * So we increment the "rename_used" score
			 * by one, to indicate ourselves as a user
			 */
			if (p->broken_pair && !p->score)
				p->one->rename_used++;
			register_rename_src(p->one, p->score);
		}
		else if (detect_rename == DIFF_DETECT_COPY) {
			/*
			 * Increment the "rename_used" score by
			 * one, to indicate ourselves as a user.
338
			 */
339 340
			p->one->rename_used++;
			register_rename_src(p->one, p->score);
341
		}
342
	}
343
	if (rename_dst_nr == 0 || rename_src_nr == 0)
344 345
		goto cleanup; /* nothing to do */

346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361
	/*
	 * This basically does a test for the rename matrix not
	 * growing larger than a "rename_limit" square matrix, ie:
	 *
	 *    rename_dst_nr * rename_src_nr > rename_limit * rename_limit
	 *
	 * but handles the potential overflow case specially (and we
	 * assume at least 32-bit integers)
	 */
	if (rename_limit <= 0 || rename_limit > 32767)
		rename_limit = 32767;
	if (rename_dst_nr > rename_limit && rename_src_nr > rename_limit)
		goto cleanup;
	if (rename_dst_nr * rename_src_nr > rename_limit * rename_limit)
		goto cleanup;

362 363
	/*
	 * We really want to cull the candidates list early
364 365
	 * with cheap tests in order to avoid doing deltas.
	 */
366
	rename_count = find_exact_renames();
367 368 369 370

	/* Have we run out the created file pool?  If so we can avoid
	 * doing the delta matrix altogether.
	 */
J
Junio C Hamano 已提交
371
	if (rename_count == rename_dst_nr)
372
		goto cleanup;
373

374 375 376
	if (minimum_score == MAX_SCORE)
		goto cleanup;

J
Junio C Hamano 已提交
377
	num_create = (rename_dst_nr - rename_count);
378
	num_src = rename_src_nr;
379
	mx = xmalloc(sizeof(*mx) * num_create * num_src);
380
	for (dst_cnt = i = 0; i < rename_dst_nr; i++) {
381
		int base = dst_cnt * num_src;
382 383
		struct diff_filespec *two = rename_dst[i].two;
		if (rename_dst[i].pair)
384
			continue; /* dealt with exact match already. */
385 386 387 388 389 390 391
		for (j = 0; j < rename_src_nr; j++) {
			struct diff_filespec *one = rename_src[j].one;
			struct diff_score *m = &mx[base+j];
			m->src = j;
			m->dst = i;
			m->score = estimate_similarity(one, two,
						       minimum_score);
392
			m->name_score = basename_same(one, two);
393
			diff_free_filespec_blob(one);
394
		}
395
		/* We do not need the text anymore */
396
		diff_free_filespec_blob(two);
397 398 399 400 401
		dst_cnt++;
	}
	/* cost matrix sorted by most to least similar pair */
	qsort(mx, num_create * num_src, sizeof(*mx), score_compare);
	for (i = 0; i < num_create * num_src; i++) {
402 403 404
		struct diff_rename_dst *dst = &rename_dst[mx[i].dst];
		if (dst->pair)
			continue; /* already done, either exact or fuzzy. */
405
		if (mx[i].score < minimum_score)
406
			break; /* there is no more usable pair. */
J
Junio C Hamano 已提交
407 408
		record_rename_pair(mx[i].dst, mx[i].src, mx[i].score);
		rename_count++;
409 410 411
	}
	free(mx);

412
 cleanup:
413
	/* At this point, we have found some renames and copies and they
J
Junio C Hamano 已提交
414
	 * are recorded in rename_dst.  The original list is still in *q.
415
	 */
416 417
	outq.queue = NULL;
	outq.nr = outq.alloc = 0;
418
	for (i = 0; i < q->nr; i++) {
419 420 421
		struct diff_filepair *p = q->queue[i];
		struct diff_filepair *pair_to_free = NULL;

422 423 424 425 426 427 428 429 430 431
		if (!DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) {
			/*
			 * Creation
			 *
			 * We would output this create record if it has
			 * not been turned into a rename/copy already.
			 */
			struct diff_rename_dst *dst =
				locate_rename_dst(p->two, 0);
			if (dst && dst->pair) {
432 433 434 435
				diff_q(&outq, dst->pair);
				pair_to_free = p;
			}
			else
436 437
				/* no matching rename/copy source, so
				 * record this as a creation.
438 439
				 */
				diff_q(&outq, p);
440
		}
441 442 443 444
		else if (DIFF_FILE_VALID(p->one) && !DIFF_FILE_VALID(p->two)) {
			/*
			 * Deletion
			 *
445 446 447 448
			 * We would output this delete record if:
			 *
			 * (1) this is a broken delete and the counterpart
			 *     broken create remains in the output; or
J
Junio C Hamano 已提交
449 450 451
			 * (2) this is not a broken delete, and rename_dst
			 *     does not have a rename/copy to move p->one->path
			 *     out of existence.
452 453 454 455 456
			 *
			 * Otherwise, the counterpart broken create
			 * has been turned into a rename-edit; or
			 * delete did not have a matching create to
			 * begin with.
457
			 */
458 459 460 461 462 463 464 465 466
			if (DIFF_PAIR_BROKEN(p)) {
				/* broken delete */
				struct diff_rename_dst *dst =
					locate_rename_dst(p->one, 0);
				if (dst && dst->pair)
					/* counterpart is now rename/copy */
					pair_to_free = p;
			}
			else {
467
				if (p->one->rename_used)
468 469 470
					/* this path remains */
					pair_to_free = p;
			}
471 472 473 474 475 476

			if (pair_to_free)
				;
			else
				diff_q(&outq, p);
		}
477
		else if (!diff_unmodified_pair(p))
478
			/* all the usual ones need to be kept */
479
			diff_q(&outq, p);
480 481 482 483
		else
			/* no need to keep unmodified pairs */
			pair_to_free = p;

484 485
		if (pair_to_free)
			diff_free_filepair(pair_to_free);
486
	}
487
	diff_debug_queue("done copying original", &outq);
488

489 490 491
	free(q->queue);
	*q = outq;
	diff_debug_queue("done collapsing", q);
492

493 494
	for (i = 0; i < rename_dst_nr; i++)
		free_filespec(rename_dst[i].two);
J
Junio C Hamano 已提交
495

496 497 498 499 500 501
	free(rename_dst);
	rename_dst = NULL;
	rename_dst_nr = rename_dst_alloc = 0;
	free(rename_src);
	rename_src = NULL;
	rename_src_nr = rename_src_alloc = 0;
502 503
	return;
}