pack-objects.c 33.4 KB
Newer Older
1 2
#include "cache.h"
#include "object.h"
3 4 5 6
#include "blob.h"
#include "commit.h"
#include "tag.h"
#include "tree.h"
7
#include "delta.h"
8
#include "pack.h"
9
#include "csum-file.h"
J
Junio C Hamano 已提交
10
#include "tree-walk.h"
11
#include <sys/time.h>
12
#include <signal.h>
13

14
static const char pack_usage[] = "git-pack-objects [-q] [--no-reuse-delta] [--non-empty] [--local] [--incremental] [--window=N] [--depth=N] {--stdout | base-name} < object-list";
15 16 17

struct object_entry {
	unsigned char sha1[20];
18
	unsigned long size;	/* uncompressed size */
19 20 21
	unsigned long offset;	/* offset into the final pack file;
				 * nonzero if already written.
				 */
22
	unsigned int depth;	/* delta depth */
23
	unsigned int delta_limit;	/* base adjustment for in-pack delta */
24
	unsigned int hash;	/* name hint hash */
25
	enum object_type type;
26
	enum object_type in_pack_type;	/* could be delta */
27 28 29 30
	unsigned long delta_size;	/* delta data size (uncompressed) */
	struct object_entry *delta;	/* delta base object */
	struct packed_git *in_pack; 	/* already in pack */
	unsigned int in_pack_offset;
31 32 33 34
	struct object_entry *delta_child; /* delitified objects who bases me */
	struct object_entry *delta_sibling; /* other deltified objects who
					     * uses the same base as me
					     */
35 36 37 38
	int preferred_base;	/* we do not pack this, but is encouraged to
				 * be used as the base objectto delta huge
				 * objects against.
				 */
39 40
};

41 42 43 44 45 46 47 48 49 50 51 52 53
/*
 * Objects we are going to pack are colected in objects array (dynamically
 * expanded).  nr_objects & nr_alloc controls this array.  They are stored
 * in the order we see -- typically rev-list --objects order that gives us
 * nice "minimum seek" order.
 *
 * sorted-by-sha ans sorted-by-type are arrays of pointers that point at
 * elements in the objects array.  The former is used to build the pack
 * index (lists object names in the ascending order to help offset lookup),
 * and the latter is used to group similar things together by try_delta()
 * heuristics.
 */

54
static unsigned char object_list_sha1[20];
55
static int non_empty = 0;
56
static int no_reuse_delta = 0;
L
Linus Torvalds 已提交
57
static int local = 0;
58
static int incremental = 0;
59 60
static struct object_entry **sorted_by_sha, **sorted_by_type;
static struct object_entry *objects = NULL;
61
static int nr_objects = 0, nr_alloc = 0, nr_result = 0;
62
static const char *base_name;
63
static unsigned char pack_file_sha1[20];
J
Junio C Hamano 已提交
64
static int progress = 1;
65
static volatile sig_atomic_t progress_update = 0;
66

67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
/*
 * The object names in objects array are hashed with this hashtable,
 * to help looking up the entry by object name.  Binary search from
 * sorted_by_sha is also possible but this was easier to code and faster.
 * This hashtable is built after all the objects are seen.
 */
static int *object_ix = NULL;
static int object_ix_hashsz = 0;

/*
 * Pack index for existing packs give us easy access to the offsets into
 * corresponding pack file where each object's data starts, but the entries
 * do not store the size of the compressed representation (uncompressed
 * size is easily available by examining the pack entry header).  We build
 * a hashtable of existing packs (pack_revindex), and keep reverse index
 * here -- pack index file is sorted by object name mapping to offset; this
 * pack_revindex[].revindex array is an ordered list of offsets, so if you
 * know the offset of an object, next offset is where its packed
 * representation ends.
 */
struct pack_revindex {
	struct packed_git *p;
	unsigned long *revindex;
} *pack_revindex = NULL;
static int pack_revindex_hashsz = 0;

/*
 * stats
 */
static int written = 0;
97
static int written_delta = 0;
98
static int reused = 0;
99
static int reused_delta = 0;
100 101 102

static int pack_revindex_ix(struct packed_git *p)
{
L
Luck, Tony 已提交
103
	unsigned long ui = (unsigned long)p;
104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158
	int i;

	ui = ui ^ (ui >> 16); /* defeat structure alignment */
	i = (int)(ui % pack_revindex_hashsz);
	while (pack_revindex[i].p) {
		if (pack_revindex[i].p == p)
			return i;
		if (++i == pack_revindex_hashsz)
			i = 0;
	}
	return -1 - i;
}

static void prepare_pack_ix(void)
{
	int num;
	struct packed_git *p;
	for (num = 0, p = packed_git; p; p = p->next)
		num++;
	if (!num)
		return;
	pack_revindex_hashsz = num * 11;
	pack_revindex = xcalloc(sizeof(*pack_revindex), pack_revindex_hashsz);
	for (p = packed_git; p; p = p->next) {
		num = pack_revindex_ix(p);
		num = - 1 - num;
		pack_revindex[num].p = p;
	}
	/* revindex elements are lazily initialized */
}

static int cmp_offset(const void *a_, const void *b_)
{
	unsigned long a = *(unsigned long *) a_;
	unsigned long b = *(unsigned long *) b_;
	if (a < b)
		return -1;
	else if (a == b)
		return 0;
	else
		return 1;
}

/*
 * Ordered list of offsets of objects in the pack.
 */
static void prepare_pack_revindex(struct pack_revindex *rix)
{
	struct packed_git *p = rix->p;
	int num_ent = num_packed_objects(p);
	int i;
	void *index = p->index_base + 256;

	rix->revindex = xmalloc(sizeof(unsigned long) * (num_ent + 1));
	for (i = 0; i < num_ent; i++) {
159
		uint32_t hl = *((uint32_t *)(index + 24 * i));
160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
		rix->revindex[i] = ntohl(hl);
	}
	/* This knows the pack format -- the 20-byte trailer
	 * follows immediately after the last object data.
	 */
	rix->revindex[num_ent] = p->pack_size - 20;
	qsort(rix->revindex, num_ent, sizeof(unsigned long), cmp_offset);
}

static unsigned long find_packed_object_size(struct packed_git *p,
					     unsigned long ofs)
{
	int num;
	int lo, hi;
	struct pack_revindex *rix;
	unsigned long *revindex;
	num = pack_revindex_ix(p);
	if (num < 0)
		die("internal error: pack revindex uninitialized");
	rix = &pack_revindex[num];
	if (!rix->revindex)
		prepare_pack_revindex(rix);
	revindex = rix->revindex;
	lo = 0;
	hi = num_packed_objects(p) + 1;
	do {
		int mi = (lo + hi) / 2;
		if (revindex[mi] == ofs) {
			return revindex[mi+1] - ofs;
		}
		else if (ofs < revindex[mi])
			hi = mi;
		else
			lo = mi + 1;
	} while (lo < hi);
	die("internal error: pack revindex corrupt");
}

198 199 200 201 202 203 204 205 206
static void *delta_against(void *buf, unsigned long size, struct object_entry *entry)
{
	unsigned long othersize, delta_size;
	char type[10];
	void *otherbuf = read_sha1_file(entry->delta->sha1, type, &othersize);
	void *delta_buf;

	if (!otherbuf)
		die("unable to read %s", sha1_to_hex(entry->delta->sha1));
207
        delta_buf = diff_delta(otherbuf, othersize,
208
			       buf, size, &delta_size, 0);
209
        if (!delta_buf || delta_size != entry->delta_size)
210 211 212 213 214 215
        	die("delta size changed");
        free(buf);
        free(otherbuf);
	return delta_buf;
}

216 217 218 219 220 221 222 223 224
/*
 * The per-object header is a pretty dense thing, which is
 *  - first byte: low four bits are "size", then three bits of "type",
 *    and the high bit is "size continues".
 *  - each byte afterwards: low seven bits are size continuation,
 *    with the high bit being "size continues"
 */
static int encode_header(enum object_type type, unsigned long size, unsigned char *hdr)
{
225
	int n = 1;
226 227 228 229 230
	unsigned char c;

	if (type < OBJ_COMMIT || type > OBJ_DELTA)
		die("bad type %d", type);

231 232 233
	c = (type << 4) | (size & 15);
	size >>= 4;
	while (size) {
234
		*hdr++ = c | 0x80;
235 236 237
		c = size & 0x7f;
		size >>= 7;
		n++;
238 239 240 241 242
	}
	*hdr = c;
	return n;
}

243 244
static unsigned long write_object(struct sha1file *f,
				  struct object_entry *entry)
245 246 247
{
	unsigned long size;
	char type[10];
248
	void *buf;
249
	unsigned char header[10];
250
	unsigned hdrlen, datalen;
251
	enum object_type obj_type;
252
	int to_reuse = 0;
253

254 255
	if (entry->preferred_base)
		return 0;
256

257
	obj_type = entry->type;
258 259 260 261 262 263 264 265 266 267 268 269 270 271
	if (! entry->in_pack)
		to_reuse = 0;	/* can't reuse what we don't have */
	else if (obj_type == OBJ_DELTA)
		to_reuse = 1;	/* check_object() decided it for us */
	else if (obj_type != entry->in_pack_type)
		to_reuse = 0;	/* pack has delta which is unusable */
	else if (entry->delta)
		to_reuse = 0;	/* we want to pack afresh */
	else
		to_reuse = 1;	/* we have it in-pack undeltified,
				 * and we do not need to deltify it.
				 */

	if (! to_reuse) {
272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296
		buf = read_sha1_file(entry->sha1, type, &size);
		if (!buf)
			die("unable to read %s", sha1_to_hex(entry->sha1));
		if (size != entry->size)
			die("object %s size inconsistency (%lu vs %lu)",
			    sha1_to_hex(entry->sha1), size, entry->size);
		if (entry->delta) {
			buf = delta_against(buf, size, entry);
			size = entry->delta_size;
			obj_type = OBJ_DELTA;
		}
		/*
		 * The object header is a byte of 'type' followed by zero or
		 * more bytes of length.  For deltas, the 20 bytes of delta
		 * sha1 follows that.
		 */
		hdrlen = encode_header(obj_type, size, header);
		sha1write(f, header, hdrlen);

		if (entry->delta) {
			sha1write(f, entry->delta, 20);
			hdrlen += 20;
		}
		datalen = sha1write_compressed(f, buf, size);
		free(buf);
297
	}
298 299 300 301 302 303 304 305 306
	else {
		struct packed_git *p = entry->in_pack;
		use_packed_git(p);

		datalen = find_packed_object_size(p, entry->in_pack_offset);
		buf = p->pack_base + entry->in_pack_offset;
		sha1write(f, buf, datalen);
		unuse_packed_git(p);
		hdrlen = 0; /* not really */
307 308
		if (obj_type == OBJ_DELTA)
			reused_delta++;
309
		reused++;
310
	}
311 312
	if (obj_type == OBJ_DELTA)
		written_delta++;
313
	written++;
314 315 316
	return hdrlen + datalen;
}

317 318 319 320 321 322 323 324 325 326 327
static unsigned long write_one(struct sha1file *f,
			       struct object_entry *e,
			       unsigned long offset)
{
	if (e->offset)
		/* offset starts from header size and cannot be zero
		 * if it is written already.
		 */
		return offset;
	e->offset = offset;
	offset += write_object(f, e);
J
Junio C Hamano 已提交
328
	/* if we are deltified, write out its base object. */
329 330 331 332 333
	if (e->delta)
		offset = write_one(f, e->delta, offset);
	return offset;
}

334 335 336
static void write_pack_file(void)
{
	int i;
337
	struct sha1file *f;
338 339
	unsigned long offset;
	struct pack_header hdr;
340 341
	unsigned last_percent = 999;
	int do_progress = 0;
342

343 344
	if (!base_name)
		f = sha1fd(1, "<stdout>");
345 346 347 348 349 350
	else {
		f = sha1create("%s-%s.%s", base_name,
			       sha1_to_hex(object_list_sha1), "pack");
		do_progress = progress;
	}
	if (do_progress)
351
		fprintf(stderr, "Writing %d objects.\n", nr_result);
352

353
	hdr.hdr_signature = htonl(PACK_SIGNATURE);
354
	hdr.hdr_version = htonl(PACK_VERSION);
355
	hdr.hdr_entries = htonl(nr_result);
356 357
	sha1write(f, &hdr, sizeof(hdr));
	offset = sizeof(hdr);
358 359
	if (!nr_result)
		goto done;
360
	for (i = 0; i < nr_objects; i++) {
361
		offset = write_one(f, objects + i, offset);
362
		if (do_progress) {
363
			unsigned percent = written * 100 / nr_result;
364 365
			if (progress_update || percent != last_percent) {
				fprintf(stderr, "%4u%% (%u/%u) done\r",
366
					percent, written, nr_result);
367 368 369
				progress_update = 0;
				last_percent = percent;
			}
370 371
		}
	}
372 373
	if (do_progress)
		fputc('\n', stderr);
374
 done:
375
	sha1close(f, pack_file_sha1, 1);
376 377 378 379 380
}

static void write_index_file(void)
{
	int i;
381 382
	struct sha1file *f = sha1create("%s-%s.%s", base_name,
					sha1_to_hex(object_list_sha1), "idx");
383
	struct object_entry **list = sorted_by_sha;
384
	struct object_entry **last = list + nr_result;
385 386 387 388 389
	unsigned int array[256];

	/*
	 * Write the first-level table (the list is sorted,
	 * but we use a 256-entry lookup to be able to avoid
390
	 * having to do eight extra binary search iterations).
391 392 393 394 395 396 397 398 399 400 401 402
	 */
	for (i = 0; i < 256; i++) {
		struct object_entry **next = list;
		while (next < last) {
			struct object_entry *entry = *next;
			if (entry->sha1[0] != i)
				break;
			next++;
		}
		array[i] = htonl(next - sorted_by_sha);
		list = next;
	}
403
	sha1write(f, array, 256 * sizeof(int));
404 405 406 407 408

	/*
	 * Write the actual SHA1 entries..
	 */
	list = sorted_by_sha;
409
	for (i = 0; i < nr_result; i++) {
410 411
		struct object_entry *entry = *list++;
		unsigned int offset = htonl(entry->offset);
412 413
		sha1write(f, &offset, 4);
		sha1write(f, entry->sha1, 20);
414
	}
415 416
	sha1write(f, pack_file_sha1, 20);
	sha1close(f, NULL, 1);
417 418
}

419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447
static int locate_object_entry_hash(const unsigned char *sha1)
{
	int i;
	unsigned int ui;
	memcpy(&ui, sha1, sizeof(unsigned int));
	i = ui % object_ix_hashsz;
	while (0 < object_ix[i]) {
		if (!memcmp(sha1, objects[object_ix[i]-1].sha1, 20))
			return i;
		if (++i == object_ix_hashsz)
			i = 0;
	}
	return -1 - i;
}

static struct object_entry *locate_object_entry(const unsigned char *sha1)
{
	int i;

	if (!object_ix_hashsz)
		return NULL;

	i = locate_object_entry_hash(sha1);
	if (0 <= i)
		return &objects[object_ix[i]-1];
	return NULL;
}

static void rehash_objects(void)
448
{
449 450 451 452 453 454 455
	int i;
	struct object_entry *oe;

	object_ix_hashsz = nr_objects * 3;
	if (object_ix_hashsz < 1024)
		object_ix_hashsz = 1024;
	object_ix = xrealloc(object_ix, sizeof(int) * object_ix_hashsz);
456
	memset(object_ix, 0, sizeof(int) * object_ix_hashsz);
457 458 459 460 461 462 463 464 465
	for (i = 0, oe = objects; i < nr_objects; i++, oe++) {
		int ix = locate_object_entry_hash(oe->sha1);
		if (0 <= ix)
			continue;
		ix = -1 - ix;
		object_ix[ix] = i + 1;
	}
}

466 467 468 469 470 471
struct name_path {
	struct name_path *up;
	const char *elem;
	int len;
};

472 473
#define DIRBITS 12

474 475 476 477
static unsigned name_hash(struct name_path *path, const char *name)
{
	struct name_path *p = path;
	const char *n = name + strlen(name);
478
	unsigned hash = 0, name_hash = 0, name_done = 0;
479 480 481 482 483

	if (n != name && n[-1] == '\n')
		n--;
	while (name <= --n) {
		unsigned char c = *n;
484 485 486 487 488
		if (c == '/' && !name_done) {
			name_hash = hash;
			name_done = 1;
			hash = 0;
		}
489 490
		hash = hash * 11 + c;
	}
491 492 493 494
	if (!name_done) {
		name_hash = hash;
		hash = 0;
	}
495 496 497 498 499 500 501 502
	for (p = path; p; p = p->up) {
		hash = hash * 11 + '/';
		n = p->elem + p->len;
		while (p->elem <= --n) {
			unsigned char c = *n;
			hash = hash * 11 + c;
		}
	}
503 504 505 506 507
	/*
	 * Make sure "Makefile" and "t/Makefile" are hashed separately
	 * but close enough.
	 */
	hash = (name_hash<<DIRBITS) | (hash & ((1U<<DIRBITS )-1));
508 509 510 511
	return hash;
}

static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclude)
512 513 514
{
	unsigned int idx = nr_objects;
	struct object_entry *entry;
515
	struct packed_git *p;
516 517
	unsigned int found_offset = 0;
	struct packed_git *found_pack = NULL;
518
	int ix, status = 0;
519

520
	if (!exclude) {
L
Linus Torvalds 已提交
521 522 523 524 525 526 527
		for (p = packed_git; p; p = p->next) {
			struct pack_entry e;
			if (find_pack_entry_one(sha1, &e, p)) {
				if (incremental)
					return 0;
				if (local && !p->pack_local)
					return 0;
528 529 530 531
				if (!found_pack) {
					found_offset = e.offset;
					found_pack = e.p;
				}
L
Linus Torvalds 已提交
532 533 534
			}
		}
	}
535 536
	if ((entry = locate_object_entry(sha1)) != NULL)
		goto already_added;
537

538 539 540 541 542 543
	if (idx >= nr_alloc) {
		unsigned int needed = (idx + 1024) * 3 / 2;
		objects = xrealloc(objects, needed * sizeof(*entry));
		nr_alloc = needed;
	}
	entry = objects + idx;
544
	nr_objects = idx + 1;
545 546
	memset(entry, 0, sizeof(*entry));
	memcpy(entry->sha1, sha1, 20);
547
	entry->hash = hash;
548 549 550 551 552 553 554 555

	if (object_ix_hashsz * 3 <= nr_objects * 4)
		rehash_objects();
	else {
		ix = locate_object_entry_hash(entry->sha1);
		if (0 <= ix)
			die("internal error in object hashing.");
		object_ix[-1 - ix] = idx + 1;
556
	}
557
	status = 1;
558 559

 already_added:
560 561 562 563
	if (progress_update) {
		fprintf(stderr, "Counting objects...%d\r", nr_objects);
		progress_update = 0;
	}
564 565 566 567 568 569 570
	if (exclude)
		entry->preferred_base = 1;
	else {
		if (found_pack) {
			entry->in_pack = found_pack;
			entry->in_pack_offset = found_offset;
		}
571
	}
572
	return status;
573 574
}

575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691
struct pbase_tree_cache {
	unsigned char sha1[20];
	int ref;
	int temporary;
	void *tree_data;
	unsigned long tree_size;
};

static struct pbase_tree_cache *(pbase_tree_cache[256]);
static int pbase_tree_cache_ix(const unsigned char *sha1)
{
	return sha1[0] % ARRAY_SIZE(pbase_tree_cache);
}
static int pbase_tree_cache_ix_incr(int ix)
{
	return (ix+1) % ARRAY_SIZE(pbase_tree_cache);
}

static struct pbase_tree {
	struct pbase_tree *next;
	/* This is a phony "cache" entry; we are not
	 * going to evict it nor find it through _get()
	 * mechanism -- this is for the toplevel node that
	 * would almost always change with any commit.
	 */
	struct pbase_tree_cache pcache;
} *pbase_tree;

static struct pbase_tree_cache *pbase_tree_get(const unsigned char *sha1)
{
	struct pbase_tree_cache *ent, *nent;
	void *data;
	unsigned long size;
	char type[20];
	int neigh;
	int my_ix = pbase_tree_cache_ix(sha1);
	int available_ix = -1;

	/* pbase-tree-cache acts as a limited hashtable.
	 * your object will be found at your index or within a few
	 * slots after that slot if it is cached.
	 */
	for (neigh = 0; neigh < 8; neigh++) {
		ent = pbase_tree_cache[my_ix];
		if (ent && !memcmp(ent->sha1, sha1, 20)) {
			ent->ref++;
			return ent;
		}
		else if (((available_ix < 0) && (!ent || !ent->ref)) ||
			 ((0 <= available_ix) &&
			  (!ent && pbase_tree_cache[available_ix])))
			available_ix = my_ix;
		if (!ent)
			break;
		my_ix = pbase_tree_cache_ix_incr(my_ix);
	}

	/* Did not find one.  Either we got a bogus request or
	 * we need to read and perhaps cache.
	 */
	data = read_sha1_file(sha1, type, &size);
	if (!data)
		return NULL;
	if (strcmp(type, tree_type)) {
		free(data);
		return NULL;
	}

	/* We need to either cache or return a throwaway copy */

	if (available_ix < 0)
		ent = NULL;
	else {
		ent = pbase_tree_cache[available_ix];
		my_ix = available_ix;
	}

	if (!ent) {
		nent = xmalloc(sizeof(*nent));
		nent->temporary = (available_ix < 0);
	}
	else {
		/* evict and reuse */
		free(ent->tree_data);
		nent = ent;
	}
	memcpy(nent->sha1, sha1, 20);
	nent->tree_data = data;
	nent->tree_size = size;
	nent->ref = 1;
	if (!nent->temporary)
		pbase_tree_cache[my_ix] = nent;
	return nent;
}

static void pbase_tree_put(struct pbase_tree_cache *cache)
{
	if (!cache->temporary) {
		cache->ref--;
		return;
	}
	free(cache->tree_data);
	free(cache);
}

static int name_cmp_len(const char *name)
{
	int i;
	for (i = 0; name[i] && name[i] != '\n' && name[i] != '/'; i++)
		;
	return i;
}

static void add_pbase_object(struct tree_desc *tree,
			     struct name_path *up,
			     const char *name,
			     int cmplen)
692
{
693 694
	while (tree->size) {
		const unsigned char *sha1;
695 696 697
		const char *entry_name;
		int entry_len;
		unsigned mode;
698 699 700
		unsigned long size;
		char type[20];

701
		sha1 = tree_entry_extract(tree, &entry_name, &mode);
702
		update_tree_entry(tree);
703 704 705 706 707
		entry_len = strlen(entry_name);
		if (entry_len != cmplen ||
		    memcmp(entry_name, name, cmplen) ||
		    !has_sha1_file(sha1) ||
		    sha1_object_info(sha1, type, &size))
708
			continue;
709 710 711 712 713
		if (name[cmplen] != '/') {
			unsigned hash = name_hash(up, name);
			add_object_entry(sha1, hash, 1);
			return;
		}
714
		if (!strcmp(type, tree_type)) {
715
			struct tree_desc sub;
716
			struct name_path me;
717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734
			struct pbase_tree_cache *tree;
			const char *down = name+cmplen+1;
			int downlen = name_cmp_len(down);

			tree = pbase_tree_get(sha1);
			if (!tree)
				return;
			sub.buf = tree->tree_data;
			sub.size = tree->tree_size;

			me.up = up;
			me.elem = entry_name;
			me.len = entry_len;
			add_pbase_object(&sub, &me, down, downlen);
			pbase_tree_put(tree);
		}
	}
}
735

736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793
static unsigned *done_pbase_paths;
static int done_pbase_paths_num;
static int done_pbase_paths_alloc;
static int done_pbase_path_pos(unsigned hash)
{
	int lo = 0;
	int hi = done_pbase_paths_num;
	while (lo < hi) {
		int mi = (hi + lo) / 2;
		if (done_pbase_paths[mi] == hash)
			return mi;
		if (done_pbase_paths[mi] < hash)
			hi = mi;
		else
			lo = mi + 1;
	}
	return -lo-1;
}

static int check_pbase_path(unsigned hash)
{
	int pos = (!done_pbase_paths) ? -1 : done_pbase_path_pos(hash);
	if (0 <= pos)
		return 1;
	pos = -pos - 1;
	if (done_pbase_paths_alloc <= done_pbase_paths_num) {
		done_pbase_paths_alloc = alloc_nr(done_pbase_paths_alloc);
		done_pbase_paths = xrealloc(done_pbase_paths,
					    done_pbase_paths_alloc *
					    sizeof(unsigned));
	}
	done_pbase_paths_num++;
	if (pos < done_pbase_paths_num)
		memmove(done_pbase_paths + pos + 1,
			done_pbase_paths + pos,
			(done_pbase_paths_num - pos - 1) * sizeof(unsigned));
	done_pbase_paths[pos] = hash;
	return 0;
}

static void add_preferred_base_object(char *name, unsigned hash)
{
	struct pbase_tree *it;
	int cmplen = name_cmp_len(name);

	if (check_pbase_path(hash))
		return;

	for (it = pbase_tree; it; it = it->next) {
		if (cmplen == 0) {
			hash = name_hash(NULL, "");
			add_object_entry(it->pcache.sha1, hash, 1);
		}
		else {
			struct tree_desc tree;
			tree.buf = it->pcache.tree_data;
			tree.size = it->pcache.tree_size;
			add_pbase_object(&tree, NULL, name, cmplen);
794
		}
795 796 797
	}
}

798
static void add_preferred_base(unsigned char *sha1)
799
{
800 801 802 803
	struct pbase_tree *it;
	void *data;
	unsigned long size;
	unsigned char tree_sha1[20];
804

805 806
	data = read_object_with_reference(sha1, tree_type, &size, tree_sha1);
	if (!data)
807
		return;
808 809 810 811 812 813 814 815 816 817 818 819 820 821 822

	for (it = pbase_tree; it; it = it->next) {
		if (!memcmp(it->pcache.sha1, tree_sha1, 20)) {
			free(data);
			return;
		}
	}

	it = xcalloc(1, sizeof(*it));
	it->next = pbase_tree;
	pbase_tree = it;

	memcpy(it->pcache.sha1, tree_sha1, 20);
	it->pcache.tree_data = data;
	it->pcache.tree_size = size;
823 824
}

825 826
static void check_object(struct object_entry *entry)
{
827 828
	char type[20];

829
	if (entry->in_pack && !entry->preferred_base) {
830 831 832 833 834 835 836 837 838 839 840 841
		unsigned char base[20];
		unsigned long size;
		struct object_entry *base_entry;

		/* We want in_pack_type even if we do not reuse delta.
		 * There is no point not reusing non-delta representations.
		 */
		check_reuse_pack_delta(entry->in_pack,
				       entry->in_pack_offset,
				       base, &size,
				       &entry->in_pack_type);

842 843 844 845
		/* Check if it is delta, and the base is also an object
		 * we are going to pack.  If so we will reuse the existing
		 * delta.
		 */
846 847
		if (!no_reuse_delta &&
		    entry->in_pack_type == OBJ_DELTA &&
848 849
		    (base_entry = locate_object_entry(base)) &&
		    (!base_entry->preferred_base)) {
850 851 852 853 854

			/* Depth value does not matter - find_deltas()
			 * will never consider reused delta as the
			 * base object to deltify other objects
			 * against, in order to avoid circular deltas.
855
			 */
856 857

			/* uncompressed size of the delta data */
858 859 860
			entry->size = entry->delta_size = size;
			entry->delta = base_entry;
			entry->type = OBJ_DELTA;
861

862 863
			entry->delta_sibling = base_entry->delta_child;
			base_entry->delta_child = entry;
864

865 866 867
			return;
		}
		/* Otherwise we would do the usual */
868
	}
869 870

	if (sha1_object_info(entry->sha1, type, &entry->size))
871 872
		die("unable to get type of object %s",
		    sha1_to_hex(entry->sha1));
873

874
	if (!strcmp(type, commit_type)) {
875
		entry->type = OBJ_COMMIT;
876
	} else if (!strcmp(type, tree_type)) {
877
		entry->type = OBJ_TREE;
878
	} else if (!strcmp(type, blob_type)) {
879
		entry->type = OBJ_BLOB;
880
	} else if (!strcmp(type, tag_type)) {
881 882 883 884 885 886
		entry->type = OBJ_TAG;
	} else
		die("unable to pack object %s of type %s",
		    sha1_to_hex(entry->sha1), type);
}

887 888 889 890 891 892 893 894 895 896 897 898 899
static unsigned int check_delta_limit(struct object_entry *me, unsigned int n)
{
	struct object_entry *child = me->delta_child;
	unsigned int m = n;
	while (child) {
		unsigned int c = check_delta_limit(child, n + 1);
		if (m < c)
			m = c;
		child = child->delta_sibling;
	}
	return m;
}

900 901 902
static void get_object_details(void)
{
	int i;
903
	struct object_entry *entry;
904

905
	prepare_pack_ix();
906 907
	for (i = 0, entry = objects; i < nr_objects; i++, entry++)
		check_object(entry);
908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924

	if (nr_objects == nr_result) {
		/*
		 * Depth of objects that depend on the entry -- this
		 * is subtracted from depth-max to break too deep
		 * delta chain because of delta data reusing.
		 * However, we loosen this restriction when we know we
		 * are creating a thin pack -- it will have to be
		 * expanded on the other end anyway, so do not
		 * artificially cut the delta chain and let it go as
		 * deep as it wants.
		 */
		for (i = 0, entry = objects; i < nr_objects; i++, entry++)
			if (!entry->delta && entry->delta_child)
				entry->delta_limit =
					check_delta_limit(entry, 1);
	}
925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954
}

typedef int (*entry_sort_t)(const struct object_entry *, const struct object_entry *);

static entry_sort_t current_sort;

static int sort_comparator(const void *_a, const void *_b)
{
	struct object_entry *a = *(struct object_entry **)_a;
	struct object_entry *b = *(struct object_entry **)_b;
	return current_sort(a,b);
}

static struct object_entry **create_sorted_list(entry_sort_t sort)
{
	struct object_entry **list = xmalloc(nr_objects * sizeof(struct object_entry *));
	int i;

	for (i = 0; i < nr_objects; i++)
		list[i] = objects + i;
	current_sort = sort;
	qsort(list, nr_objects, sizeof(struct object_entry *), sort_comparator);
	return list;
}

static int sha1_sort(const struct object_entry *a, const struct object_entry *b)
{
	return memcmp(a->sha1, b->sha1, 20);
}

T
Timo Hirvonen 已提交
955
static struct object_entry **create_final_object_list(void)
956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972
{
	struct object_entry **list;
	int i, j;

	for (i = nr_result = 0; i < nr_objects; i++)
		if (!objects[i].preferred_base)
			nr_result++;
	list = xmalloc(nr_result * sizeof(struct object_entry *));
	for (i = j = 0; i < nr_objects; i++) {
		if (!objects[i].preferred_base)
			list[j++] = objects + i;
	}
	current_sort = sha1_sort;
	qsort(list, nr_result, sizeof(struct object_entry *), sort_comparator);
	return list;
}

973 974 975 976 977 978
static int type_size_sort(const struct object_entry *a, const struct object_entry *b)
{
	if (a->type < b->type)
		return -1;
	if (a->type > b->type)
		return 1;
979 980 981 982
	if (a->hash < b->hash)
		return -1;
	if (a->hash > b->hash)
		return 1;
983 984 985 986
	if (a->preferred_base < b->preferred_base)
		return -1;
	if (a->preferred_base > b->preferred_base)
		return 1;
987 988 989 990 991 992 993 994 995 996
	if (a->size < b->size)
		return -1;
	if (a->size > b->size)
		return 1;
	return a < b ? -1 : (a > b);
}

struct unpacked {
	struct object_entry *entry;
	void *data;
997
	struct delta_index *index;
998 999 1000
};

/*
1001 1002 1003 1004 1005 1006
 * We search for deltas _backwards_ in a list sorted by type and
 * by size, so that we see progressively smaller and smaller files.
 * That's because we prefer deltas to be from the bigger file
 * to the smaller - deletes are potentially cheaper, but perhaps
 * more importantly, the bigger file is likely the more recent
 * one.
1007
 */
1008 1009
static int try_delta(struct unpacked *trg, struct unpacked *src,
		     struct delta_index *src_index, unsigned max_depth)
1010
{
1011 1012 1013
	struct object_entry *trg_entry = trg->entry;
	struct object_entry *src_entry = src->entry;
	unsigned long size, src_size, delta_size, sizediff, max_size;
1014 1015 1016
	void *delta_buf;

	/* Don't bother doing diffs between different types */
1017
	if (trg_entry->type != src_entry->type)
1018 1019
		return -1;

1020 1021 1022
	/* We do not compute delta to *create* objects we are not
	 * going to pack.
	 */
1023
	if (trg_entry->preferred_base)
1024
		return -1;
1025

1026 1027
	/*
	 * If the current object is at pack edge, take the depth the
1028 1029
	 * objects that depend on the current object into account --
	 * otherwise they would become too deep.
1030
	 */
1031 1032
	if (trg_entry->delta_child) {
		if (max_depth <= trg_entry->delta_limit)
1033
			return 0;
1034
		max_depth -= trg_entry->delta_limit;
1035
	}
1036
	if (src_entry->depth >= max_depth)
1037
		return 0;
1038

1039 1040
	/* Now some size filtering euristics. */
	size = trg_entry->size;
1041
	max_size = size / 2 - 20;
1042 1043 1044 1045
	if (trg_entry->delta)
		max_size = trg_entry->delta_size-1;
	src_size = src_entry->size;
	sizediff = src_size < size ? size - src_size : 0;
1046
	if (sizediff >= max_size)
1047
		return 0;
1048 1049

	delta_buf = create_delta(src_index, trg->data, size, &delta_size, max_size);
1050
	if (!delta_buf)
1051
		return 0;
1052 1053 1054 1055

	trg_entry->delta = src_entry;
	trg_entry->delta_size = delta_size;
	trg_entry->depth = src_entry->depth + 1;
1056
	free(delta_buf);
1057
	return 1;
1058 1059
}

1060 1061 1062 1063 1064
static void progress_interval(int signum)
{
	progress_update = 1;
}

1065
static void find_deltas(struct object_entry **list, int window, int depth)
1066
{
1067
	int i, idx;
1068 1069
	unsigned int array_size = window * sizeof(struct unpacked);
	struct unpacked *array = xmalloc(array_size);
1070 1071
	unsigned processed = 0;
	unsigned last_percent = 999;
1072 1073

	memset(array, 0, array_size);
1074 1075
	i = nr_objects;
	idx = 0;
1076
	if (progress)
1077
		fprintf(stderr, "Deltifying %d objects.\n", nr_result);
1078

1079
	while (--i >= 0) {
1080 1081 1082 1083 1084 1085
		struct object_entry *entry = list[i];
		struct unpacked *n = array + idx;
		unsigned long size;
		char type[10];
		int j;

1086 1087 1088
		if (!entry->preferred_base)
			processed++;

1089
		if (progress) {
1090
			unsigned percent = processed * 100 / nr_result;
1091 1092
			if (percent != last_percent || progress_update) {
				fprintf(stderr, "%4u%% (%u/%u) done\r",
1093
					percent, processed, nr_result);
1094 1095 1096
				progress_update = 0;
				last_percent = percent;
			}
1097
		}
1098 1099 1100

		if (entry->delta)
			/* This happens if we decided to reuse existing
1101
			 * delta from a pack.  "!no_reuse_delta &&" is implied.
1102 1103 1104
			 */
			continue;

1105 1106
		if (entry->size < 50)
			continue;
1107 1108
		if (n->index)
			free_delta_index(n->index);
1109 1110 1111 1112
		free(n->data);
		n->entry = entry;
		n->data = read_sha1_file(entry->sha1, type, &size);
		if (size != entry->size)
1113 1114 1115 1116 1117
			die("object %s inconsistent object length (%lu vs %lu)",
			    sha1_to_hex(entry->sha1), size, entry->size);
		n->index = create_delta_index(n->data, size);
		if (!n->index)
			die("out of memory");
1118

L
Linus Torvalds 已提交
1119 1120 1121
		j = window;
		while (--j > 0) {
			unsigned int other_idx = idx + j;
1122
			struct unpacked *m;
L
Linus Torvalds 已提交
1123 1124
			if (other_idx >= window)
				other_idx -= window;
1125 1126 1127
			m = array + other_idx;
			if (!m->entry)
				break;
1128
			if (try_delta(n, m, m->index, depth) < 0)
1129 1130
				break;
		}
1131 1132 1133 1134 1135 1136 1137 1138 1139
#if 0
		/* if we made n a delta, and if n is already at max
		 * depth, leaving it in the window is pointless.  we
		 * should evict it first.
		 * ... in theory only; somehow this makes things worse.
		 */
		if (entry->delta && depth <= entry->depth)
			continue;
#endif
1140 1141 1142
		idx++;
		if (idx >= window)
			idx = 0;
1143
	}
1144

1145 1146 1147
	if (progress)
		fputc('\n', stderr);

1148 1149 1150
	for (i = 0; i < window; ++i) {
		if (array[i].index)
			free_delta_index(array[i].index);
1151
		free(array[i].data);
1152
	}
1153
	free(array);
1154 1155
}

1156 1157
static void prepare_pack(int window, int depth)
{
1158
	get_object_details();
1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183
	sorted_by_type = create_sorted_list(type_size_sort);
	if (window && depth)
		find_deltas(sorted_by_type, window+1, depth);
}

static int reuse_cached_pack(unsigned char *sha1, int pack_to_stdout)
{
	static const char cache[] = "pack-cache/pack-%s.%s";
	char *cached_pack, *cached_idx;
	int ifd, ofd, ifd_ix = -1;

	cached_pack = git_path(cache, sha1_to_hex(sha1), "pack");
	ifd = open(cached_pack, O_RDONLY);
	if (ifd < 0)
		return 0;

	if (!pack_to_stdout) {
		cached_idx = git_path(cache, sha1_to_hex(sha1), "idx");
		ifd_ix = open(cached_idx, O_RDONLY);
		if (ifd_ix < 0) {
			close(ifd);
			return 0;
		}
	}

1184 1185 1186
	if (progress)
		fprintf(stderr, "Reusing %d objects pack %s\n", nr_objects,
			sha1_to_hex(sha1));
1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217

	if (pack_to_stdout) {
		if (copy_fd(ifd, 1))
			exit(1);
		close(ifd);
	}
	else {
		char name[PATH_MAX];
		snprintf(name, sizeof(name),
			 "%s-%s.%s", base_name, sha1_to_hex(sha1), "pack");
		ofd = open(name, O_CREAT | O_EXCL | O_WRONLY, 0666);
		if (ofd < 0)
			die("unable to open %s (%s)", name, strerror(errno));
		if (copy_fd(ifd, ofd))
			exit(1);
		close(ifd);

		snprintf(name, sizeof(name),
			 "%s-%s.%s", base_name, sha1_to_hex(sha1), "idx");
		ofd = open(name, O_CREAT | O_EXCL | O_WRONLY, 0666);
		if (ofd < 0)
			die("unable to open %s (%s)", name, strerror(errno));
		if (copy_fd(ifd_ix, ofd))
			exit(1);
		close(ifd_ix);
		puts(sha1_to_hex(sha1));
	}

	return 1;
}

1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234
static void setup_progress_signal(void)
{
	struct sigaction sa;
	struct itimerval v;

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = progress_interval;
	sigemptyset(&sa.sa_mask);
	sa.sa_flags = SA_RESTART;
	sigaction(SIGALRM, &sa, NULL);

	v.it_interval.tv_sec = 1;
	v.it_interval.tv_usec = 0;
	v.it_value = v.it_interval;
	setitimer(ITIMER_REAL, &v, NULL);
}

1235 1236
int main(int argc, char **argv)
{
1237
	SHA_CTX ctx;
N
Nicolas Pitre 已提交
1238
	char line[40 + 1 + PATH_MAX + 2];
1239
	int window = 10, depth = 10, pack_to_stdout = 0;
J
Junio C Hamano 已提交
1240
	struct object_entry **list;
1241
	int num_preferred_base = 0;
1242 1243
	int i;

1244 1245
	setup_git_directory();

1246
	progress = isatty(2);
1247 1248 1249 1250
	for (i = 1; i < argc; i++) {
		const char *arg = argv[i];

		if (*arg == '-') {
1251 1252 1253 1254
			if (!strcmp("--non-empty", arg)) {
				non_empty = 1;
				continue;
			}
L
Linus Torvalds 已提交
1255 1256 1257 1258
			if (!strcmp("--local", arg)) {
				local = 1;
				continue;
			}
1259 1260 1261 1262
			if (!strcmp("--incremental", arg)) {
				incremental = 1;
				continue;
			}
1263 1264
			if (!strncmp("--window=", arg, 9)) {
				char *end;
1265
				window = strtoul(arg+9, &end, 0);
1266 1267 1268 1269
				if (!arg[9] || *end)
					usage(pack_usage);
				continue;
			}
1270 1271 1272 1273 1274 1275 1276
			if (!strncmp("--depth=", arg, 8)) {
				char *end;
				depth = strtoul(arg+8, &end, 0);
				if (!arg[8] || *end)
					usage(pack_usage);
				continue;
			}
1277 1278 1279 1280
			if (!strcmp("--progress", arg)) {
				progress = 1;
				continue;
			}
J
Junio C Hamano 已提交
1281 1282 1283 1284
			if (!strcmp("-q", arg)) {
				progress = 0;
				continue;
			}
1285 1286 1287 1288
			if (!strcmp("--no-reuse-delta", arg)) {
				no_reuse_delta = 1;
				continue;
			}
1289 1290 1291 1292
			if (!strcmp("--stdout", arg)) {
				pack_to_stdout = 1;
				continue;
			}
1293 1294 1295 1296 1297 1298 1299
			usage(pack_usage);
		}
		if (base_name)
			usage(pack_usage);
		base_name = arg;
	}

1300
	if (pack_to_stdout != !base_name)
1301 1302
		usage(pack_usage);

L
Linus Torvalds 已提交
1303
	prepare_packed_git();
1304

1305 1306
	if (progress) {
		fprintf(stderr, "Generating pack...\n");
1307
		setup_progress_signal();
1308
	}
1309

1310
	for (;;) {
1311
		unsigned char sha1[20];
1312
		unsigned hash;
1313

1314 1315 1316 1317 1318
		if (!fgets(line, sizeof(line), stdin)) {
			if (feof(stdin))
				break;
			if (!ferror(stdin))
				die("fgets returned NULL, not EOF, not error!");
1319 1320 1321 1322
			if (errno != EINTR)
				die("fgets: %s", strerror(errno));
			clearerr(stdin);
			continue;
1323 1324
		}

1325 1326 1327 1328
		if (line[0] == '-') {
			if (get_sha1_hex(line+1, sha1))
				die("expected edge sha1, got garbage:\n %s",
				    line+1);
1329 1330
			if (num_preferred_base++ < window)
				add_preferred_base(sha1);
1331
			continue;
1332
		}
1333
		if (get_sha1_hex(line, sha1))
1334
			die("expected sha1, got garbage:\n %s", line);
1335 1336 1337
		hash = name_hash(NULL, line+41);
		add_preferred_base_object(line+41, hash);
		add_object_entry(sha1, hash, 0);
1338
	}
1339 1340
	if (progress)
		fprintf(stderr, "Done counting %d objects.\n", nr_objects);
1341 1342
	sorted_by_sha = create_final_object_list();
	if (non_empty && !nr_result)
1343
		return 0;
1344

J
Junio C Hamano 已提交
1345 1346
	SHA1_Init(&ctx);
	list = sorted_by_sha;
1347
	for (i = 0; i < nr_result; i++) {
J
Junio C Hamano 已提交
1348 1349 1350 1351
		struct object_entry *entry = *list++;
		SHA1_Update(&ctx, entry->sha1, 20);
	}
	SHA1_Final(object_list_sha1, &ctx);
1352 1353
	if (progress && (nr_objects != nr_result))
		fprintf(stderr, "Result has %d objects.\n", nr_result);
J
Junio C Hamano 已提交
1354

1355 1356 1357
	if (reuse_cached_pack(object_list_sha1, pack_to_stdout))
		;
	else {
1358 1359
		if (nr_result)
			prepare_pack(window, depth);
1360 1361 1362 1363 1364 1365 1366 1367
		if (progress && pack_to_stdout) {
			/* the other end usually displays progress itself */
			struct itimerval v = {{0,},};
			setitimer(ITIMER_REAL, &v, NULL);
			signal(SIGALRM, SIG_IGN );
			progress_update = 0;
		}
		write_pack_file();
1368 1369 1370 1371
		if (!pack_to_stdout) {
			write_index_file();
			puts(sha1_to_hex(object_list_sha1));
		}
1372
	}
1373 1374
	if (progress)
		fprintf(stderr, "Total %d, written %d (delta %d), reused %d (delta %d)\n",
1375
			nr_result, written, written_delta, reused, reused_delta);
1376 1377
	return 0;
}