pack-objects.c 28.4 KB
Newer Older
1 2 3
#include "cache.h"
#include "object.h"
#include "delta.h"
4
#include "pack.h"
5
#include "csum-file.h"
J
Junio C Hamano 已提交
6
#include "tree-walk.h"
7
#include <sys/time.h>
8
#include <signal.h>
9

10
static const char pack_usage[] = "git-pack-objects [-q] [--no-reuse-delta] [--non-empty] [--local] [--incremental] [--window=N] [--depth=N] {--stdout | base-name} < object-list";
11 12 13

struct object_entry {
	unsigned char sha1[20];
14
	unsigned long size;	/* uncompressed size */
15 16 17
	unsigned long offset;	/* offset into the final pack file;
				 * nonzero if already written.
				 */
18
	unsigned int depth;	/* delta depth */
19
	unsigned int delta_limit;	/* base adjustment for in-pack delta */
20
	unsigned int hash;	/* name hint hash */
21
	enum object_type type;
22
	enum object_type in_pack_type;	/* could be delta */
23 24 25 26
	unsigned long delta_size;	/* delta data size (uncompressed) */
	struct object_entry *delta;	/* delta base object */
	struct packed_git *in_pack; 	/* already in pack */
	unsigned int in_pack_offset;
27 28 29 30
	struct object_entry *delta_child; /* delitified objects who bases me */
	struct object_entry *delta_sibling; /* other deltified objects who
					     * uses the same base as me
					     */
31 32 33 34
	int preferred_base;	/* we do not pack this, but is encouraged to
				 * be used as the base objectto delta huge
				 * objects against.
				 */
35 36
};

37 38 39 40 41 42 43 44 45 46 47 48 49
/*
 * Objects we are going to pack are colected in objects array (dynamically
 * expanded).  nr_objects & nr_alloc controls this array.  They are stored
 * in the order we see -- typically rev-list --objects order that gives us
 * nice "minimum seek" order.
 *
 * sorted-by-sha ans sorted-by-type are arrays of pointers that point at
 * elements in the objects array.  The former is used to build the pack
 * index (lists object names in the ascending order to help offset lookup),
 * and the latter is used to group similar things together by try_delta()
 * heuristics.
 */

50
static unsigned char object_list_sha1[20];
51
static int non_empty = 0;
52
static int no_reuse_delta = 0;
L
Linus Torvalds 已提交
53
static int local = 0;
54
static int incremental = 0;
55 56
static struct object_entry **sorted_by_sha, **sorted_by_type;
static struct object_entry *objects = NULL;
57
static int nr_objects = 0, nr_alloc = 0, nr_result = 0;
58
static const char *base_name;
59
static unsigned char pack_file_sha1[20];
J
Junio C Hamano 已提交
60
static int progress = 1;
61
static volatile int progress_update = 0;
62

63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
/*
 * The object names in objects array are hashed with this hashtable,
 * to help looking up the entry by object name.  Binary search from
 * sorted_by_sha is also possible but this was easier to code and faster.
 * This hashtable is built after all the objects are seen.
 */
static int *object_ix = NULL;
static int object_ix_hashsz = 0;

/*
 * Pack index for existing packs give us easy access to the offsets into
 * corresponding pack file where each object's data starts, but the entries
 * do not store the size of the compressed representation (uncompressed
 * size is easily available by examining the pack entry header).  We build
 * a hashtable of existing packs (pack_revindex), and keep reverse index
 * here -- pack index file is sorted by object name mapping to offset; this
 * pack_revindex[].revindex array is an ordered list of offsets, so if you
 * know the offset of an object, next offset is where its packed
 * representation ends.
 */
struct pack_revindex {
	struct packed_git *p;
	unsigned long *revindex;
} *pack_revindex = NULL;
static int pack_revindex_hashsz = 0;

/*
 * stats
 */
static int written = 0;
93
static int written_delta = 0;
94
static int reused = 0;
95
static int reused_delta = 0;
96 97 98

static int pack_revindex_ix(struct packed_git *p)
{
L
Luck, Tony 已提交
99
	unsigned long ui = (unsigned long)p;
100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193
	int i;

	ui = ui ^ (ui >> 16); /* defeat structure alignment */
	i = (int)(ui % pack_revindex_hashsz);
	while (pack_revindex[i].p) {
		if (pack_revindex[i].p == p)
			return i;
		if (++i == pack_revindex_hashsz)
			i = 0;
	}
	return -1 - i;
}

static void prepare_pack_ix(void)
{
	int num;
	struct packed_git *p;
	for (num = 0, p = packed_git; p; p = p->next)
		num++;
	if (!num)
		return;
	pack_revindex_hashsz = num * 11;
	pack_revindex = xcalloc(sizeof(*pack_revindex), pack_revindex_hashsz);
	for (p = packed_git; p; p = p->next) {
		num = pack_revindex_ix(p);
		num = - 1 - num;
		pack_revindex[num].p = p;
	}
	/* revindex elements are lazily initialized */
}

static int cmp_offset(const void *a_, const void *b_)
{
	unsigned long a = *(unsigned long *) a_;
	unsigned long b = *(unsigned long *) b_;
	if (a < b)
		return -1;
	else if (a == b)
		return 0;
	else
		return 1;
}

/*
 * Ordered list of offsets of objects in the pack.
 */
static void prepare_pack_revindex(struct pack_revindex *rix)
{
	struct packed_git *p = rix->p;
	int num_ent = num_packed_objects(p);
	int i;
	void *index = p->index_base + 256;

	rix->revindex = xmalloc(sizeof(unsigned long) * (num_ent + 1));
	for (i = 0; i < num_ent; i++) {
		long hl = *((long *)(index + 24 * i));
		rix->revindex[i] = ntohl(hl);
	}
	/* This knows the pack format -- the 20-byte trailer
	 * follows immediately after the last object data.
	 */
	rix->revindex[num_ent] = p->pack_size - 20;
	qsort(rix->revindex, num_ent, sizeof(unsigned long), cmp_offset);
}

static unsigned long find_packed_object_size(struct packed_git *p,
					     unsigned long ofs)
{
	int num;
	int lo, hi;
	struct pack_revindex *rix;
	unsigned long *revindex;
	num = pack_revindex_ix(p);
	if (num < 0)
		die("internal error: pack revindex uninitialized");
	rix = &pack_revindex[num];
	if (!rix->revindex)
		prepare_pack_revindex(rix);
	revindex = rix->revindex;
	lo = 0;
	hi = num_packed_objects(p) + 1;
	do {
		int mi = (lo + hi) / 2;
		if (revindex[mi] == ofs) {
			return revindex[mi+1] - ofs;
		}
		else if (ofs < revindex[mi])
			hi = mi;
		else
			lo = mi + 1;
	} while (lo < hi);
	die("internal error: pack revindex corrupt");
}

194 195 196 197 198 199 200 201 202
static void *delta_against(void *buf, unsigned long size, struct object_entry *entry)
{
	unsigned long othersize, delta_size;
	char type[10];
	void *otherbuf = read_sha1_file(entry->delta->sha1, type, &othersize);
	void *delta_buf;

	if (!otherbuf)
		die("unable to read %s", sha1_to_hex(entry->delta->sha1));
203
        delta_buf = diff_delta(otherbuf, othersize,
204
			       buf, size, &delta_size, 0);
205
        if (!delta_buf || delta_size != entry->delta_size)
206 207 208 209 210 211
        	die("delta size changed");
        free(buf);
        free(otherbuf);
	return delta_buf;
}

212 213 214 215 216 217 218 219 220
/*
 * The per-object header is a pretty dense thing, which is
 *  - first byte: low four bits are "size", then three bits of "type",
 *    and the high bit is "size continues".
 *  - each byte afterwards: low seven bits are size continuation,
 *    with the high bit being "size continues"
 */
static int encode_header(enum object_type type, unsigned long size, unsigned char *hdr)
{
221
	int n = 1;
222 223 224 225 226
	unsigned char c;

	if (type < OBJ_COMMIT || type > OBJ_DELTA)
		die("bad type %d", type);

227 228 229
	c = (type << 4) | (size & 15);
	size >>= 4;
	while (size) {
230
		*hdr++ = c | 0x80;
231 232 233
		c = size & 0x7f;
		size >>= 7;
		n++;
234 235 236 237 238
	}
	*hdr = c;
	return n;
}

239 240
static unsigned long write_object(struct sha1file *f,
				  struct object_entry *entry)
241 242 243
{
	unsigned long size;
	char type[10];
244
	void *buf;
245
	unsigned char header[10];
246
	unsigned hdrlen, datalen;
247
	enum object_type obj_type;
248
	int to_reuse = 0;
249

250 251
	if (entry->preferred_base)
		return 0;
252

253
	obj_type = entry->type;
254 255 256 257 258 259 260 261 262 263 264 265 266 267
	if (! entry->in_pack)
		to_reuse = 0;	/* can't reuse what we don't have */
	else if (obj_type == OBJ_DELTA)
		to_reuse = 1;	/* check_object() decided it for us */
	else if (obj_type != entry->in_pack_type)
		to_reuse = 0;	/* pack has delta which is unusable */
	else if (entry->delta)
		to_reuse = 0;	/* we want to pack afresh */
	else
		to_reuse = 1;	/* we have it in-pack undeltified,
				 * and we do not need to deltify it.
				 */

	if (! to_reuse) {
268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292
		buf = read_sha1_file(entry->sha1, type, &size);
		if (!buf)
			die("unable to read %s", sha1_to_hex(entry->sha1));
		if (size != entry->size)
			die("object %s size inconsistency (%lu vs %lu)",
			    sha1_to_hex(entry->sha1), size, entry->size);
		if (entry->delta) {
			buf = delta_against(buf, size, entry);
			size = entry->delta_size;
			obj_type = OBJ_DELTA;
		}
		/*
		 * The object header is a byte of 'type' followed by zero or
		 * more bytes of length.  For deltas, the 20 bytes of delta
		 * sha1 follows that.
		 */
		hdrlen = encode_header(obj_type, size, header);
		sha1write(f, header, hdrlen);

		if (entry->delta) {
			sha1write(f, entry->delta, 20);
			hdrlen += 20;
		}
		datalen = sha1write_compressed(f, buf, size);
		free(buf);
293
	}
294 295 296 297 298 299 300 301 302
	else {
		struct packed_git *p = entry->in_pack;
		use_packed_git(p);

		datalen = find_packed_object_size(p, entry->in_pack_offset);
		buf = p->pack_base + entry->in_pack_offset;
		sha1write(f, buf, datalen);
		unuse_packed_git(p);
		hdrlen = 0; /* not really */
303 304
		if (obj_type == OBJ_DELTA)
			reused_delta++;
305
		reused++;
306
	}
307 308
	if (obj_type == OBJ_DELTA)
		written_delta++;
309
	written++;
310 311 312
	return hdrlen + datalen;
}

313 314 315 316 317 318 319 320 321 322 323
static unsigned long write_one(struct sha1file *f,
			       struct object_entry *e,
			       unsigned long offset)
{
	if (e->offset)
		/* offset starts from header size and cannot be zero
		 * if it is written already.
		 */
		return offset;
	e->offset = offset;
	offset += write_object(f, e);
J
Junio C Hamano 已提交
324
	/* if we are deltified, write out its base object. */
325 326 327 328 329
	if (e->delta)
		offset = write_one(f, e->delta, offset);
	return offset;
}

330 331 332
static void write_pack_file(void)
{
	int i;
333
	struct sha1file *f;
334 335
	unsigned long offset;
	struct pack_header hdr;
336 337
	unsigned last_percent = 999;
	int do_progress = 0;
338

339 340
	if (!base_name)
		f = sha1fd(1, "<stdout>");
341 342 343 344 345 346
	else {
		f = sha1create("%s-%s.%s", base_name,
			       sha1_to_hex(object_list_sha1), "pack");
		do_progress = progress;
	}
	if (do_progress)
347
		fprintf(stderr, "Writing %d objects.\n", nr_result);
348

349
	hdr.hdr_signature = htonl(PACK_SIGNATURE);
350
	hdr.hdr_version = htonl(PACK_VERSION);
351
	hdr.hdr_entries = htonl(nr_result);
352 353
	sha1write(f, &hdr, sizeof(hdr));
	offset = sizeof(hdr);
354 355
	if (!nr_result)
		goto done;
356
	for (i = 0; i < nr_objects; i++) {
357
		offset = write_one(f, objects + i, offset);
358
		if (do_progress) {
359
			unsigned percent = written * 100 / nr_result;
360 361
			if (progress_update || percent != last_percent) {
				fprintf(stderr, "%4u%% (%u/%u) done\r",
362
					percent, written, nr_result);
363 364 365
				progress_update = 0;
				last_percent = percent;
			}
366 367
		}
	}
368 369
	if (do_progress)
		fputc('\n', stderr);
370
 done:
371
	sha1close(f, pack_file_sha1, 1);
372 373 374 375 376
}

static void write_index_file(void)
{
	int i;
377 378
	struct sha1file *f = sha1create("%s-%s.%s", base_name,
					sha1_to_hex(object_list_sha1), "idx");
379
	struct object_entry **list = sorted_by_sha;
380
	struct object_entry **last = list + nr_result;
381 382 383 384 385
	unsigned int array[256];

	/*
	 * Write the first-level table (the list is sorted,
	 * but we use a 256-entry lookup to be able to avoid
386
	 * having to do eight extra binary search iterations).
387 388 389 390 391 392 393 394 395 396 397 398
	 */
	for (i = 0; i < 256; i++) {
		struct object_entry **next = list;
		while (next < last) {
			struct object_entry *entry = *next;
			if (entry->sha1[0] != i)
				break;
			next++;
		}
		array[i] = htonl(next - sorted_by_sha);
		list = next;
	}
399
	sha1write(f, array, 256 * sizeof(int));
400 401 402 403 404

	/*
	 * Write the actual SHA1 entries..
	 */
	list = sorted_by_sha;
405
	for (i = 0; i < nr_result; i++) {
406 407
		struct object_entry *entry = *list++;
		unsigned int offset = htonl(entry->offset);
408 409
		sha1write(f, &offset, 4);
		sha1write(f, entry->sha1, 20);
410
	}
411 412
	sha1write(f, pack_file_sha1, 20);
	sha1close(f, NULL, 1);
413 414
}

415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443
static int locate_object_entry_hash(const unsigned char *sha1)
{
	int i;
	unsigned int ui;
	memcpy(&ui, sha1, sizeof(unsigned int));
	i = ui % object_ix_hashsz;
	while (0 < object_ix[i]) {
		if (!memcmp(sha1, objects[object_ix[i]-1].sha1, 20))
			return i;
		if (++i == object_ix_hashsz)
			i = 0;
	}
	return -1 - i;
}

static struct object_entry *locate_object_entry(const unsigned char *sha1)
{
	int i;

	if (!object_ix_hashsz)
		return NULL;

	i = locate_object_entry_hash(sha1);
	if (0 <= i)
		return &objects[object_ix[i]-1];
	return NULL;
}

static void rehash_objects(void)
444
{
445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461
	int i;
	struct object_entry *oe;

	object_ix_hashsz = nr_objects * 3;
	if (object_ix_hashsz < 1024)
		object_ix_hashsz = 1024;
	object_ix = xrealloc(object_ix, sizeof(int) * object_ix_hashsz);
	object_ix = memset(object_ix, 0, sizeof(int) * object_ix_hashsz);
	for (i = 0, oe = objects; i < nr_objects; i++, oe++) {
		int ix = locate_object_entry_hash(oe->sha1);
		if (0 <= ix)
			continue;
		ix = -1 - ix;
		object_ix[ix] = i + 1;
	}
}

462 463 464 465 466 467
struct name_path {
	struct name_path *up;
	const char *elem;
	int len;
};

468 469
#define DIRBITS 12

470 471 472 473
static unsigned name_hash(struct name_path *path, const char *name)
{
	struct name_path *p = path;
	const char *n = name + strlen(name);
474
	unsigned hash = 0, name_hash = 0, name_done = 0;
475 476 477 478 479

	if (n != name && n[-1] == '\n')
		n--;
	while (name <= --n) {
		unsigned char c = *n;
480 481 482 483 484
		if (c == '/' && !name_done) {
			name_hash = hash;
			name_done = 1;
			hash = 0;
		}
485 486
		hash = hash * 11 + c;
	}
487 488 489 490
	if (!name_done) {
		name_hash = hash;
		hash = 0;
	}
491 492 493 494 495 496 497 498
	for (p = path; p; p = p->up) {
		hash = hash * 11 + '/';
		n = p->elem + p->len;
		while (p->elem <= --n) {
			unsigned char c = *n;
			hash = hash * 11 + c;
		}
	}
499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518
	/*
	 * Make sure "Makefile" and "t/Makefile" are hashed separately
	 * but close enough.
	 */
	hash = (name_hash<<DIRBITS) | (hash & ((1U<<DIRBITS )-1));

	if (0) { /* debug */
		n = name + strlen(name);
		if (n != name && n[-1] == '\n')
			n--;
		while (name <= --n)
			fputc(*n, stderr);
		for (p = path; p; p = p->up) {
			fputc('/', stderr);
			n = p->elem + p->len;
			while (p->elem <= --n)
				fputc(*n, stderr);
		}
		fprintf(stderr, "\t%08x\n", hash);
	}
519 520 521 522
	return hash;
}

static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclude)
523 524 525
{
	unsigned int idx = nr_objects;
	struct object_entry *entry;
526
	struct packed_git *p;
527 528
	unsigned int found_offset = 0;
	struct packed_git *found_pack = NULL;
529
	int ix, status = 0;
530

531
	if (!exclude) {
L
Linus Torvalds 已提交
532 533 534 535 536 537 538
		for (p = packed_git; p; p = p->next) {
			struct pack_entry e;
			if (find_pack_entry_one(sha1, &e, p)) {
				if (incremental)
					return 0;
				if (local && !p->pack_local)
					return 0;
539 540 541 542
				if (!found_pack) {
					found_offset = e.offset;
					found_pack = e.p;
				}
L
Linus Torvalds 已提交
543 544 545
			}
		}
	}
546 547
	if ((entry = locate_object_entry(sha1)) != NULL)
		goto already_added;
548

549 550 551 552 553 554
	if (idx >= nr_alloc) {
		unsigned int needed = (idx + 1024) * 3 / 2;
		objects = xrealloc(objects, needed * sizeof(*entry));
		nr_alloc = needed;
	}
	entry = objects + idx;
555
	nr_objects = idx + 1;
556 557
	memset(entry, 0, sizeof(*entry));
	memcpy(entry->sha1, sha1, 20);
558
	entry->hash = hash;
559 560 561 562 563 564 565 566

	if (object_ix_hashsz * 3 <= nr_objects * 4)
		rehash_objects();
	else {
		ix = locate_object_entry_hash(entry->sha1);
		if (0 <= ix)
			die("internal error in object hashing.");
		object_ix[-1 - ix] = idx + 1;
567
	}
568
	status = 1;
569 570

 already_added:
571 572 573 574
	if (progress_update) {
		fprintf(stderr, "Counting objects...%d\r", nr_objects);
		progress_update = 0;
	}
575 576 577 578 579 580 581
	if (exclude)
		entry->preferred_base = 1;
	else {
		if (found_pack) {
			entry->in_pack = found_pack;
			entry->in_pack_offset = found_offset;
		}
582
	}
583
	return status;
584 585
}

586
static void add_pbase_tree(struct tree_desc *tree, struct name_path *up)
587
{
588 589 590
	while (tree->size) {
		const unsigned char *sha1;
		const char *name;
591
		unsigned mode, hash;
592 593 594 595 596 597 598 599 600
		unsigned long size;
		char type[20];

		sha1 = tree_entry_extract(tree, &name, &mode);
		update_tree_entry(tree);
		if (!has_sha1_file(sha1))
			continue;
		if (sha1_object_info(sha1, type, &size))
			continue;
601

602 603
		hash = name_hash(up, name);
		if (!add_object_entry(sha1, hash, 1))
604 605
			continue;

606 607 608
		if (!strcmp(type, "tree")) {
			struct tree_desc sub;
			void *elem;
609 610
			struct name_path me;

611 612 613
			elem = read_sha1_file(sha1, type, &sub.size);
			sub.buf = elem;
			if (sub.buf) {
614 615 616 617
				me.up = up;
				me.elem = name;
				me.len = strlen(name);
				add_pbase_tree(&sub, &me);
618 619 620
				free(elem);
			}
		}
621 622 623
	}
}

624
static void add_preferred_base(unsigned char *sha1)
625
{
626 627
	struct tree_desc tree;
	void *elem;
628

629 630 631 632
	elem = read_object_with_reference(sha1, "tree", &tree.size, NULL);
	tree.buf = elem;
	if (!tree.buf)
		return;
633 634
	if (add_object_entry(sha1, name_hash(NULL, ""), 1))
		add_pbase_tree(&tree, NULL);
635
	free(elem);
636 637
}

638 639
static void check_object(struct object_entry *entry)
{
640 641
	char type[20];

642
	if (entry->in_pack && !entry->preferred_base) {
643 644 645 646 647 648 649 650 651 652 653 654
		unsigned char base[20];
		unsigned long size;
		struct object_entry *base_entry;

		/* We want in_pack_type even if we do not reuse delta.
		 * There is no point not reusing non-delta representations.
		 */
		check_reuse_pack_delta(entry->in_pack,
				       entry->in_pack_offset,
				       base, &size,
				       &entry->in_pack_type);

655 656 657 658
		/* Check if it is delta, and the base is also an object
		 * we are going to pack.  If so we will reuse the existing
		 * delta.
		 */
659 660
		if (!no_reuse_delta &&
		    entry->in_pack_type == OBJ_DELTA &&
661 662
		    (base_entry = locate_object_entry(base)) &&
		    (!base_entry->preferred_base)) {
663 664 665 666 667

			/* Depth value does not matter - find_deltas()
			 * will never consider reused delta as the
			 * base object to deltify other objects
			 * against, in order to avoid circular deltas.
668
			 */
669 670

			/* uncompressed size of the delta data */
671 672 673
			entry->size = entry->delta_size = size;
			entry->delta = base_entry;
			entry->type = OBJ_DELTA;
674

675 676
			entry->delta_sibling = base_entry->delta_child;
			base_entry->delta_child = entry;
677

678 679 680
			return;
		}
		/* Otherwise we would do the usual */
681
	}
682 683

	if (sha1_object_info(entry->sha1, type, &entry->size))
684 685
		die("unable to get type of object %s",
		    sha1_to_hex(entry->sha1));
686 687 688 689 690 691 692 693 694 695 696 697 698 699

	if (!strcmp(type, "commit")) {
		entry->type = OBJ_COMMIT;
	} else if (!strcmp(type, "tree")) {
		entry->type = OBJ_TREE;
	} else if (!strcmp(type, "blob")) {
		entry->type = OBJ_BLOB;
	} else if (!strcmp(type, "tag")) {
		entry->type = OBJ_TAG;
	} else
		die("unable to pack object %s of type %s",
		    sha1_to_hex(entry->sha1), type);
}

700 701 702 703 704 705 706 707 708 709 710 711 712
static unsigned int check_delta_limit(struct object_entry *me, unsigned int n)
{
	struct object_entry *child = me->delta_child;
	unsigned int m = n;
	while (child) {
		unsigned int c = check_delta_limit(child, n + 1);
		if (m < c)
			m = c;
		child = child->delta_sibling;
	}
	return m;
}

713 714 715
static void get_object_details(void)
{
	int i;
716
	struct object_entry *entry;
717

718
	prepare_pack_ix();
719 720
	for (i = 0, entry = objects; i < nr_objects; i++, entry++)
		check_object(entry);
721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737

	if (nr_objects == nr_result) {
		/*
		 * Depth of objects that depend on the entry -- this
		 * is subtracted from depth-max to break too deep
		 * delta chain because of delta data reusing.
		 * However, we loosen this restriction when we know we
		 * are creating a thin pack -- it will have to be
		 * expanded on the other end anyway, so do not
		 * artificially cut the delta chain and let it go as
		 * deep as it wants.
		 */
		for (i = 0, entry = objects; i < nr_objects; i++, entry++)
			if (!entry->delta && entry->delta_child)
				entry->delta_limit =
					check_delta_limit(entry, 1);
	}
738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767
}

typedef int (*entry_sort_t)(const struct object_entry *, const struct object_entry *);

static entry_sort_t current_sort;

static int sort_comparator(const void *_a, const void *_b)
{
	struct object_entry *a = *(struct object_entry **)_a;
	struct object_entry *b = *(struct object_entry **)_b;
	return current_sort(a,b);
}

static struct object_entry **create_sorted_list(entry_sort_t sort)
{
	struct object_entry **list = xmalloc(nr_objects * sizeof(struct object_entry *));
	int i;

	for (i = 0; i < nr_objects; i++)
		list[i] = objects + i;
	current_sort = sort;
	qsort(list, nr_objects, sizeof(struct object_entry *), sort_comparator);
	return list;
}

static int sha1_sort(const struct object_entry *a, const struct object_entry *b)
{
	return memcmp(a->sha1, b->sha1, 20);
}

T
Timo Hirvonen 已提交
768
static struct object_entry **create_final_object_list(void)
769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785
{
	struct object_entry **list;
	int i, j;

	for (i = nr_result = 0; i < nr_objects; i++)
		if (!objects[i].preferred_base)
			nr_result++;
	list = xmalloc(nr_result * sizeof(struct object_entry *));
	for (i = j = 0; i < nr_objects; i++) {
		if (!objects[i].preferred_base)
			list[j++] = objects + i;
	}
	current_sort = sha1_sort;
	qsort(list, nr_result, sizeof(struct object_entry *), sort_comparator);
	return list;
}

786 787 788 789 790 791
static int type_size_sort(const struct object_entry *a, const struct object_entry *b)
{
	if (a->type < b->type)
		return -1;
	if (a->type > b->type)
		return 1;
792 793 794 795
	if (a->hash < b->hash)
		return -1;
	if (a->hash > b->hash)
		return 1;
796 797 798 799
	if (a->preferred_base < b->preferred_base)
		return -1;
	if (a->preferred_base > b->preferred_base)
		return 1;
800 801 802 803 804 805 806 807 808 809 810 811 812
	if (a->size < b->size)
		return -1;
	if (a->size > b->size)
		return 1;
	return a < b ? -1 : (a > b);
}

struct unpacked {
	struct object_entry *entry;
	void *data;
};

/*
813 814 815 816 817 818
 * We search for deltas _backwards_ in a list sorted by type and
 * by size, so that we see progressively smaller and smaller files.
 * That's because we prefer deltas to be from the bigger file
 * to the smaller - deletes are potentially cheaper, but perhaps
 * more importantly, the bigger file is likely the more recent
 * one.
819
 */
820
static int try_delta(struct unpacked *cur, struct unpacked *old, unsigned max_depth)
821 822 823
{
	struct object_entry *cur_entry = cur->entry;
	struct object_entry *old_entry = old->entry;
824
	unsigned long size, oldsize, delta_size, sizediff;
825
	long max_size;
826 827 828 829 830 831
	void *delta_buf;

	/* Don't bother doing diffs between different types */
	if (cur_entry->type != old_entry->type)
		return -1;

832 833 834 835
	/* We do not compute delta to *create* objects we are not
	 * going to pack.
	 */
	if (cur_entry->preferred_base)
836
		return -1;
837 838 839 840

	/* If the current object is at pack edge, take the depth the
	 * objects that depend on the current object into account --
	 * otherwise they would become too deep.
841
	 */
842 843 844 845 846
	if (cur_entry->delta_child) {
		if (max_depth <= cur_entry->delta_limit)
			return 0;
		max_depth -= cur_entry->delta_limit;
	}
847

848 849
	size = cur_entry->size;
	oldsize = old_entry->size;
850
	sizediff = oldsize > size ? oldsize - size : size - oldsize;
851 852

	if (size < 50)
853
		return -1;
854 855
	if (old_entry->depth >= max_depth)
		return 0;
856 857 858 859 860 861 862 863

	/*
	 * NOTE!
	 *
	 * We always delta from the bigger to the smaller, since that's
	 * more space-efficient (deletes don't have to say _what_ they
	 * delete).
	 */
864
	max_size = size / 2 - 20;
865 866
	if (cur_entry->delta)
		max_size = cur_entry->delta_size-1;
867 868
	if (sizediff >= max_size)
		return -1;
869 870
	delta_buf = diff_delta(old->data, oldsize,
			       cur->data, size, &delta_size, max_size);
871
	if (!delta_buf)
872 873 874
		return 0;
	cur_entry->delta = old_entry;
	cur_entry->delta_size = delta_size;
875
	cur_entry->depth = old_entry->depth + 1;
876
	free(delta_buf);
877
	return 0;
878 879
}

880 881 882 883 884 885
static void progress_interval(int signum)
{
	signal(SIGALRM, progress_interval);
	progress_update = 1;
}

886
static void find_deltas(struct object_entry **list, int window, int depth)
887
{
888
	int i, idx;
889 890
	unsigned int array_size = window * sizeof(struct unpacked);
	struct unpacked *array = xmalloc(array_size);
891 892
	unsigned processed = 0;
	unsigned last_percent = 999;
893 894

	memset(array, 0, array_size);
895 896
	i = nr_objects;
	idx = 0;
897
	if (progress)
898
		fprintf(stderr, "Deltifying %d objects.\n", nr_result);
899

900
	while (--i >= 0) {
901 902 903 904 905 906
		struct object_entry *entry = list[i];
		struct unpacked *n = array + idx;
		unsigned long size;
		char type[10];
		int j;

907 908 909
		if (!entry->preferred_base)
			processed++;

910
		if (progress) {
911
			unsigned percent = processed * 100 / nr_result;
912 913
			if (percent != last_percent || progress_update) {
				fprintf(stderr, "%4u%% (%u/%u) done\r",
914
					percent, processed, nr_result);
915 916 917
				progress_update = 0;
				last_percent = percent;
			}
918
		}
919 920 921

		if (entry->delta)
			/* This happens if we decided to reuse existing
922
			 * delta from a pack.  "!no_reuse_delta &&" is implied.
923 924 925
			 */
			continue;

926 927 928 929 930
		free(n->data);
		n->entry = entry;
		n->data = read_sha1_file(entry->sha1, type, &size);
		if (size != entry->size)
			die("object %s inconsistent object length (%lu vs %lu)", sha1_to_hex(entry->sha1), size, entry->size);
931

L
Linus Torvalds 已提交
932 933 934
		j = window;
		while (--j > 0) {
			unsigned int other_idx = idx + j;
935
			struct unpacked *m;
L
Linus Torvalds 已提交
936 937
			if (other_idx >= window)
				other_idx -= window;
938 939 940
			m = array + other_idx;
			if (!m->entry)
				break;
941
			if (try_delta(n, m, depth) < 0)
942 943
				break;
		}
944 945 946 947 948 949 950 951 952
#if 0
		/* if we made n a delta, and if n is already at max
		 * depth, leaving it in the window is pointless.  we
		 * should evict it first.
		 * ... in theory only; somehow this makes things worse.
		 */
		if (entry->delta && depth <= entry->depth)
			continue;
#endif
953 954 955
		idx++;
		if (idx >= window)
			idx = 0;
956
	}
957

958 959 960
	if (progress)
		fputc('\n', stderr);

961 962 963
	for (i = 0; i < window; ++i)
		free(array[i].data);
	free(array);
964 965
}

966 967
static void prepare_pack(int window, int depth)
{
968
	get_object_details();
969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993
	sorted_by_type = create_sorted_list(type_size_sort);
	if (window && depth)
		find_deltas(sorted_by_type, window+1, depth);
}

static int reuse_cached_pack(unsigned char *sha1, int pack_to_stdout)
{
	static const char cache[] = "pack-cache/pack-%s.%s";
	char *cached_pack, *cached_idx;
	int ifd, ofd, ifd_ix = -1;

	cached_pack = git_path(cache, sha1_to_hex(sha1), "pack");
	ifd = open(cached_pack, O_RDONLY);
	if (ifd < 0)
		return 0;

	if (!pack_to_stdout) {
		cached_idx = git_path(cache, sha1_to_hex(sha1), "idx");
		ifd_ix = open(cached_idx, O_RDONLY);
		if (ifd_ix < 0) {
			close(ifd);
			return 0;
		}
	}

994 995 996
	if (progress)
		fprintf(stderr, "Reusing %d objects pack %s\n", nr_objects,
			sha1_to_hex(sha1));
997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027

	if (pack_to_stdout) {
		if (copy_fd(ifd, 1))
			exit(1);
		close(ifd);
	}
	else {
		char name[PATH_MAX];
		snprintf(name, sizeof(name),
			 "%s-%s.%s", base_name, sha1_to_hex(sha1), "pack");
		ofd = open(name, O_CREAT | O_EXCL | O_WRONLY, 0666);
		if (ofd < 0)
			die("unable to open %s (%s)", name, strerror(errno));
		if (copy_fd(ifd, ofd))
			exit(1);
		close(ifd);

		snprintf(name, sizeof(name),
			 "%s-%s.%s", base_name, sha1_to_hex(sha1), "idx");
		ofd = open(name, O_CREAT | O_EXCL | O_WRONLY, 0666);
		if (ofd < 0)
			die("unable to open %s (%s)", name, strerror(errno));
		if (copy_fd(ifd_ix, ofd))
			exit(1);
		close(ifd_ix);
		puts(sha1_to_hex(sha1));
	}

	return 1;
}

1028 1029
int main(int argc, char **argv)
{
1030
	SHA_CTX ctx;
1031
	char line[PATH_MAX + 20];
1032
	int window = 10, depth = 10, pack_to_stdout = 0;
J
Junio C Hamano 已提交
1033
	struct object_entry **list;
1034 1035
	int i;

1036 1037
	setup_git_directory();

1038 1039 1040 1041
	for (i = 1; i < argc; i++) {
		const char *arg = argv[i];

		if (*arg == '-') {
1042 1043 1044 1045
			if (!strcmp("--non-empty", arg)) {
				non_empty = 1;
				continue;
			}
L
Linus Torvalds 已提交
1046 1047 1048 1049
			if (!strcmp("--local", arg)) {
				local = 1;
				continue;
			}
1050 1051 1052 1053
			if (!strcmp("--incremental", arg)) {
				incremental = 1;
				continue;
			}
1054 1055
			if (!strncmp("--window=", arg, 9)) {
				char *end;
1056
				window = strtoul(arg+9, &end, 0);
1057 1058 1059 1060
				if (!arg[9] || *end)
					usage(pack_usage);
				continue;
			}
1061 1062 1063 1064 1065 1066 1067
			if (!strncmp("--depth=", arg, 8)) {
				char *end;
				depth = strtoul(arg+8, &end, 0);
				if (!arg[8] || *end)
					usage(pack_usage);
				continue;
			}
J
Junio C Hamano 已提交
1068 1069 1070 1071
			if (!strcmp("-q", arg)) {
				progress = 0;
				continue;
			}
1072 1073 1074 1075
			if (!strcmp("--no-reuse-delta", arg)) {
				no_reuse_delta = 1;
				continue;
			}
1076 1077 1078 1079
			if (!strcmp("--stdout", arg)) {
				pack_to_stdout = 1;
				continue;
			}
1080 1081 1082 1083 1084 1085 1086
			usage(pack_usage);
		}
		if (base_name)
			usage(pack_usage);
		base_name = arg;
	}

1087
	if (pack_to_stdout != !base_name)
1088 1089
		usage(pack_usage);

L
Linus Torvalds 已提交
1090
	prepare_packed_git();
1091

1092
	if (progress) {
1093 1094 1095 1096 1097 1098
		struct itimerval v;
		v.it_interval.tv_sec = 1;
		v.it_interval.tv_usec = 0;
		v.it_value = v.it_interval;
		signal(SIGALRM, progress_interval);
		setitimer(ITIMER_REAL, &v, NULL);
1099 1100
		fprintf(stderr, "Generating pack...\n");
	}
1101

1102 1103
	while (fgets(line, sizeof(line), stdin) != NULL) {
		unsigned char sha1[20];
1104

1105 1106 1107 1108 1109 1110
		if (line[0] == '-') {
			if (get_sha1_hex(line+1, sha1))
				die("expected edge sha1, got garbage:\n %s",
				    line+1);
			add_preferred_base(sha1);
			continue;
1111
		}
1112
		if (get_sha1_hex(line, sha1))
1113
			die("expected sha1, got garbage:\n %s", line);
1114
		add_object_entry(sha1, name_hash(NULL, line+41), 0);
1115
	}
1116 1117
	if (progress)
		fprintf(stderr, "Done counting %d objects.\n", nr_objects);
1118 1119
	sorted_by_sha = create_final_object_list();
	if (non_empty && !nr_result)
1120
		return 0;
1121

J
Junio C Hamano 已提交
1122 1123
	SHA1_Init(&ctx);
	list = sorted_by_sha;
1124
	for (i = 0; i < nr_result; i++) {
J
Junio C Hamano 已提交
1125 1126 1127 1128
		struct object_entry *entry = *list++;
		SHA1_Update(&ctx, entry->sha1, 20);
	}
	SHA1_Final(object_list_sha1, &ctx);
1129 1130
	if (progress && (nr_objects != nr_result))
		fprintf(stderr, "Result has %d objects.\n", nr_result);
J
Junio C Hamano 已提交
1131

1132 1133 1134
	if (reuse_cached_pack(object_list_sha1, pack_to_stdout))
		;
	else {
1135 1136
		if (nr_result)
			prepare_pack(window, depth);
1137 1138 1139 1140 1141 1142 1143 1144
		if (progress && pack_to_stdout) {
			/* the other end usually displays progress itself */
			struct itimerval v = {{0,},};
			setitimer(ITIMER_REAL, &v, NULL);
			signal(SIGALRM, SIG_IGN );
			progress_update = 0;
		}
		write_pack_file();
1145 1146 1147 1148
		if (!pack_to_stdout) {
			write_index_file();
			puts(sha1_to_hex(object_list_sha1));
		}
1149
	}
1150 1151
	if (progress)
		fprintf(stderr, "Total %d, written %d (delta %d), reused %d (delta %d)\n",
1152
			nr_result, written, written_delta, reused, reused_delta);
1153 1154
	return 0;
}