pack-objects.c 9.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
#include "cache.h"
#include "object.h"
#include "delta.h"

static const char pack_usage[] = "git-pack-objects [--window=N] base-name < object-list";

enum object_type {
	OBJ_NONE,
	OBJ_COMMIT,
	OBJ_TREE,
	OBJ_BLOB,
	OBJ_DELTA	// NOTE! This is _not_ the same as a "delta" object in the filesystem
};

struct object_entry {
	unsigned char sha1[20];
	unsigned long size;
	unsigned long offset;
	unsigned int depth;
	unsigned int flags;
	enum object_type type;
	unsigned long delta_size;
	struct object_entry *delta;
};

static struct object_entry **sorted_by_sha, **sorted_by_type;
static struct object_entry *objects = NULL;
static int nr_objects = 0, nr_alloc = 0;
static int delta_window = 10;
static const char *base_name;

struct myfile {
	int fd;
	unsigned long chars;
	unsigned char buffer[8192];
};

static FILE *create_file(const char *suffix)
{
	static char filename[PATH_MAX];
	unsigned len;

	len = snprintf(filename, PATH_MAX, "%s.%s", base_name, suffix);
	if (len >= PATH_MAX)
		die("you wascally wabbit, you");
	return fopen(filename, "w");
}

static unsigned long fwrite_compressed(void *in, unsigned long size, FILE *f)
{
	z_stream stream;
	unsigned long maxsize;
	void *out;

	memset(&stream, 0, sizeof(stream));
	deflateInit(&stream, Z_DEFAULT_COMPRESSION);
	maxsize = deflateBound(&stream, size);
	out = xmalloc(maxsize);

	/* Compress it */
	stream.next_in = in;
	stream.avail_in = size;

	stream.next_out = out;
	stream.avail_out = maxsize;

	while (deflate(&stream, Z_FINISH) == Z_OK)
		/* nothing */;
	deflateEnd(&stream);

	size = stream.total_out;
	fwrite(out, size, 1, f);
	free(out);
	return size;
}

static void *delta_against(void *buf, unsigned long size, struct object_entry *entry)
{
	unsigned long othersize, delta_size;
	char type[10];
	void *otherbuf = read_sha1_file(entry->delta->sha1, type, &othersize);
	void *delta_buf;

	if (!otherbuf)
		die("unable to read %s", sha1_to_hex(entry->delta->sha1));
86 87
        delta_buf = diff_delta(buf, size, otherbuf, othersize, &delta_size, ~0UL);
        if (!delta_buf || delta_size != entry->delta_size)
88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
        	die("delta size changed");
        free(buf);
        free(otherbuf);
	return delta_buf;
}

static unsigned long write_object(FILE *f, struct object_entry *entry)
{
	unsigned long size;
	char type[10];
	void *buf = read_sha1_file(entry->sha1, type, &size);
	char header[21];
	unsigned hdrlen, datalen;

	if (!buf)
		die("unable to read %s", sha1_to_hex(entry->sha1));
	if (size != entry->size)
		die("object %s size inconsistency (%lu vs %lu)", sha1_to_hex(entry->sha1), size, entry->size);

	/*
	 * The object header is a byte of 'type' followed by four bytes of
	 * length, except for deltas that has the 20 bytes of delta sha
	 * instead.
	 */
	header[0] = ".CTB"[entry->type];
	datalen = htonl(size);
	memcpy(header+1, &datalen, 4);
	hdrlen = 5;
	if (entry->delta) {
117
		header[0] = 'D';
118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175
		memcpy(header+1, entry->delta, 20);
		buf = delta_against(buf, size, entry);
		size = entry->delta_size;
		hdrlen = 21;
	}
	fwrite(header, hdrlen, 1, f);
	datalen = fwrite_compressed(buf, size, f);
	free(buf);
	return hdrlen + datalen;
}

static void write_pack_file(void)
{
	int i;
	FILE *f = create_file("pack");
	unsigned long offset = 0;
	unsigned long mb;

	for (i = 0; i < nr_objects; i++) {
		struct object_entry *entry = objects + i;
		entry->offset = offset;
		offset += write_object(f, entry);
	}
	fclose(f);
	mb = offset >> 20;
	offset &= 0xfffff;
}

static void write_index_file(void)
{
	int i;
	FILE *f = create_file("idx");
	struct object_entry **list = sorted_by_sha;
	struct object_entry **last = list + nr_objects;
	unsigned int array[256];

	/*
	 * Write the first-level table (the list is sorted,
	 * but we use a 256-entry lookup to be able to avoid
	 * having to do eight extra binary search iterations)
	 */
	for (i = 0; i < 256; i++) {
		struct object_entry **next = list;
		while (next < last) {
			struct object_entry *entry = *next;
			if (entry->sha1[0] != i)
				break;
			next++;
		}
		array[i] = htonl(next - sorted_by_sha);
		list = next;
	}
	fwrite(array, 256, sizeof(int), f);

	/*
	 * Write the actual SHA1 entries..
	 */
	list = sorted_by_sha;
L
Linus Torvalds 已提交
176
	for (i = 0; i < nr_objects; i++) {
177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294
		struct object_entry *entry = *list++;
		unsigned int offset = htonl(entry->offset);
		fwrite(&offset, 4, 1, f);
		fwrite(entry->sha1, 20, 1, f);
	}
	fclose(f);
}

static void add_object_entry(unsigned char *sha1)
{
	unsigned int idx = nr_objects;
	struct object_entry *entry;

	if (idx >= nr_alloc) {
		unsigned int needed = (idx + 1024) * 3 / 2;
		objects = xrealloc(objects, needed * sizeof(*entry));
		nr_alloc = needed;
	}
	entry = objects + idx;
	memset(entry, 0, sizeof(*entry));
	memcpy(entry->sha1, sha1, 20);
	nr_objects = idx+1;
}

static void check_object(struct object_entry *entry)
{
	char buffer[128];
	char type[10];
	unsigned long mapsize;
	z_stream stream;
	void *map;

	map = map_sha1_file(entry->sha1, &mapsize);
	if (!map)
		die("unable to map %s", sha1_to_hex(entry->sha1));
	if (unpack_sha1_header(&stream, map, mapsize, buffer, sizeof(buffer)) < 0)
		die("unable to unpack %s header", sha1_to_hex(entry->sha1));
	munmap(map, mapsize);
	if (parse_sha1_header(buffer, type, &entry->size) < 0)
		die("unable to parse %s header", sha1_to_hex(entry->sha1));
	if (!strcmp(type, "commit")) {
		entry->type = OBJ_COMMIT;
	} else if (!strcmp(type, "tree")) {
		entry->type = OBJ_TREE;
	} else if (!strcmp(type, "blob")) {
		entry->type = OBJ_BLOB;
	} else
		die("unable to pack object %s of type %s", sha1_to_hex(entry->sha1), type);
}

static void get_object_details(void)
{
	int i;
	struct object_entry *entry = objects;

	for (i = 0; i < nr_objects; i++)
		check_object(entry++);
}

typedef int (*entry_sort_t)(const struct object_entry *, const struct object_entry *);

static entry_sort_t current_sort;

static int sort_comparator(const void *_a, const void *_b)
{
	struct object_entry *a = *(struct object_entry **)_a;
	struct object_entry *b = *(struct object_entry **)_b;
	return current_sort(a,b);
}

static struct object_entry **create_sorted_list(entry_sort_t sort)
{
	struct object_entry **list = xmalloc(nr_objects * sizeof(struct object_entry *));
	int i;

	for (i = 0; i < nr_objects; i++)
		list[i] = objects + i;
	current_sort = sort;
	qsort(list, nr_objects, sizeof(struct object_entry *), sort_comparator);
	return list;
}

static int sha1_sort(const struct object_entry *a, const struct object_entry *b)
{
	return memcmp(a->sha1, b->sha1, 20);
}

static int type_size_sort(const struct object_entry *a, const struct object_entry *b)
{
	if (a->type < b->type)
		return -1;
	if (a->type > b->type)
		return 1;
	if (a->size < b->size)
		return -1;
	if (a->size > b->size)
		return 1;
	return a < b ? -1 : (a > b);
}

struct unpacked {
	struct object_entry *entry;
	void *data;
};

/*
 * We search for deltas in a list sorted by type and by size, and
 * walk the "old" chain backwards in the list, so if the type
 * has changed or the size difference is too big, there's no point
 * in even continuing the walk, since the other old objects are
 * going to be even smaller or of a different type. So return -1
 * once we determine that there's no point even trying.
 */
static int try_delta(struct unpacked *cur, struct unpacked *old)
{
	struct object_entry *cur_entry = cur->entry;
	struct object_entry *old_entry = old->entry;
	unsigned long size, oldsize, delta_size;
295
	long max_size;
296 297 298 299 300 301 302 303
	void *delta_buf;

	/* Don't bother doing diffs between different types */
	if (cur_entry->type != old_entry->type)
		return -1;

	/* Size is guaranteed to be larger than or equal to oldsize */
	size = cur_entry->size;
304 305
	if (size < 50)
		return -1;
306 307 308 309 310 311 312 313 314 315 316
	oldsize = old_entry->size;
	if (size - oldsize > oldsize / 4)
		return -1;

	/*
	 * NOTE!
	 *
	 * We always delta from the bigger to the smaller, since that's
	 * more space-efficient (deletes don't have to say _what_ they
	 * delete).
	 */
317 318 319 320
	max_size = size / 2 - 20;
	if (cur_entry->delta)
		max_size = cur_entry->delta_size-1;
	delta_buf = diff_delta(cur->data, size, old->data, oldsize, &delta_size, max_size);
321
	if (!delta_buf)
322 323 324
		return 0;
	cur_entry->delta = old_entry;
	cur_entry->delta_size = delta_size;
325
	free(delta_buf);
326
	return 0;
327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348
}

static void find_deltas(struct object_entry **list, int window)
{
	unsigned int i;
	unsigned int array_size = window * sizeof(struct unpacked);
	struct unpacked *array = xmalloc(array_size);

	memset(array, 0, array_size);
	for (i = 0; i < nr_objects; i++) {
		unsigned int idx = i % window;
		struct object_entry *entry = list[i];
		struct unpacked *n = array + idx;
		unsigned long size;
		char type[10];
		int j;

		free(n->data);
		n->entry = entry;
		n->data = read_sha1_file(entry->sha1, type, &size);
		if (size != entry->size)
			die("object %s inconsistent object length (%lu vs %lu)", sha1_to_hex(entry->sha1), size, entry->size);
L
Linus Torvalds 已提交
349 350 351
		j = window;
		while (--j > 0) {
			unsigned int other_idx = idx + j;
352
			struct unpacked *m;
L
Linus Torvalds 已提交
353 354
			if (other_idx >= window)
				other_idx -= window;
355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408
			m = array + other_idx;
			if (!m->entry)
				break;
			if (try_delta(n, m) < 0)
				break;
		}
	}
}

int main(int argc, char **argv)
{
	char line[128];
	int i;

	for (i = 1; i < argc; i++) {
		const char *arg = argv[i];

		if (*arg == '-') {
			if (!strncmp("--window=", arg, 9)) {
				char *end;
				delta_window = strtoul(arg+9, &end, 0);
				if (!arg[9] || *end)
					usage(pack_usage);
				continue;
			}
			usage(pack_usage);
		}
		if (base_name)
			usage(pack_usage);
		base_name = arg;
	}

	if (!base_name)
		usage(pack_usage);

	while (fgets(line, sizeof(line), stdin) != NULL) {
		unsigned char sha1[20];
		if (get_sha1_hex(line, sha1))
			die("expected sha1, got garbage");
		add_object_entry(sha1);
	}
	get_object_details();

	printf("Packing %d objects\n", nr_objects);

	sorted_by_sha = create_sorted_list(sha1_sort);
	sorted_by_type = create_sorted_list(type_size_sort);
	if (delta_window)
		find_deltas(sorted_by_type, delta_window);

	write_pack_file();
	write_index_file();
	return 0;
}