downloader.go 16.3 KB
Newer Older
1 2 3
package downloader

import (
4
	"errors"
5
	"math/rand"
6 7 8 9 10 11
	"sync"
	"sync/atomic"
	"time"

	"github.com/ethereum/go-ethereum/common"
	"github.com/ethereum/go-ethereum/core/types"
12
	"github.com/ethereum/go-ethereum/event"
13 14 15 16
	"github.com/ethereum/go-ethereum/logger"
	"github.com/ethereum/go-ethereum/logger/glog"
)

17
// Fixed protocol limits and timeouts used by the downloader.
const (
	// maxHashFetch is the amount of hashes to be fetched per chunk.
	maxHashFetch = 512

	// maxBlockFetch is the amount of blocks to be fetched per chunk.
	maxBlockFetch = 128

	// peerCountTimeout is the amount of time it takes for the peer handler to
	// ignore minDesiredPeerCount.
	peerCountTimeout = 12 * time.Second

	// hashTTL is the time it takes for a hash request to time out.
	hashTTL = 5 * time.Second
)
23

24
// Tunables declared as variables rather than constants, so they can be
// reassigned at runtime.
var (
	blockTTL            = 5 * time.Second // Time it takes for a block request to time out
	crossCheckCycle     = time.Second     // Period after which to check for expired cross checks
	minDesiredPeerCount = 5               // Amount of peers desired to start syncing
)
29

30
// Sentinel errors returned by the downloader. The exported ones are visible to
// callers outside the package; the unexported ones are package-internal.
var (
	errLowTd            = errors.New("peer's TD is too low")
	ErrBusy             = errors.New("busy")
	errUnknownPeer      = errors.New("peer's unknown or unhealthy")
	ErrBadPeer          = errors.New("action from bad peer ignored")
	errNoPeers          = errors.New("no peers to keep download active")
	ErrPendingQueue     = errors.New("pending items in queue")
	ErrTimeout          = errors.New("timeout")
	errEmptyHashSet     = errors.New("empty hash set by peer")
	errPeersUnavailable = errors.New("no peers available or all peers tried for block download process")
	errAlreadyInPool    = errors.New("hash already in pool")
	ErrInvalidChain     = errors.New("retrieved hash chain is invalid")
	ErrCrossCheckFailed = errors.New("block cross-check failed")
	errCancelHashFetch  = errors.New("hash fetching cancelled (requested)")
	errCancelBlockFetch = errors.New("block downloading cancelled (requested)")
	errNoSyncActive     = errors.New("no sync active")
)

48
type hashCheckFn func(common.Hash) bool
49
type getBlockFn func(common.Hash) *types.Block
50
type chainInsertFn func(types.Blocks) (int, error)
51 52
type hashIterFn func() (common.Hash, error)

O
obscuren 已提交
53 54 55 56 57
// blockPack is a batch of blocks tagged with the id of the peer that delivered
// it. It is the message type carried by the Downloader's blockCh channel.
type blockPack struct {
	// peerId is the id of the delivering peer.
	peerId string
	// blocks holds the delivered blocks.
	blocks []*types.Block
}

O
obscuren 已提交
58 59 60 61 62
// hashPack is a batch of hashes tagged with the id of the peer that delivered
// it. It is the message type carried by the Downloader's hashCh channel.
type hashPack struct {
	// peerId is the id of the delivering peer.
	peerId string
	// hashes holds the delivered hashes.
	hashes []common.Hash
}

63
type Downloader struct {
64 65
	mux *event.TypeMux

66 67 68 69
	mu     sync.RWMutex
	queue  *queue                    // Scheduler for selecting the hashes to download
	peers  *peerSet                  // Set of active peers from which download can proceed
	checks map[common.Hash]time.Time // Pending cross checks to verify a hash chain
70

71
	// Callbacks
72 73
	hasBlock hashCheckFn
	getBlock getBlockFn
74

75
	// Status
76
	synchronising int32
77
	notified      int32
78 79 80

	// Channels
	newPeerCh chan *peer
O
obscuren 已提交
81
	hashCh    chan hashPack
82
	blockCh   chan blockPack
83 84 85

	cancelCh   chan struct{} // Channel to cancel mid-flight syncs
	cancelLock sync.RWMutex  // Lock to protect the cancel channel in delivers
86 87
}

88
func New(mux *event.TypeMux, hasBlock hashCheckFn, getBlock getBlockFn) *Downloader {
89
	downloader := &Downloader{
90
		mux:       mux,
91
		queue:     newQueue(),
92
		peers:     newPeerSet(),
93 94 95
		hasBlock:  hasBlock,
		getBlock:  getBlock,
		newPeerCh: make(chan *peer, 1),
O
obscuren 已提交
96
		hashCh:    make(chan hashPack, 1),
97
		blockCh:   make(chan blockPack, 1),
98 99 100 101
	}
	return downloader
}

O
obscuren 已提交
102
func (d *Downloader) Stats() (current int, max int) {
103
	return d.queue.Size()
O
obscuren 已提交
104 105
}

106 107 108 109 110
// Synchronising reports whether a synchronisation cycle is currently running,
// i.e. whether the atomically accessed status flag is set.
func (d *Downloader) Synchronising() bool {
	state := atomic.LoadInt32(&d.synchronising)
	return state > 0
}

111 112 113 114 115 116 117 118
// RegisterPeer injects a new download peer into the set of block source to be
// used for fetching hashes and blocks from.
func (d *Downloader) RegisterPeer(id string, head common.Hash, getHashes hashFetcherFn, getBlocks blockFetcherFn) error {
	glog.V(logger.Detail).Infoln("Registering peer", id)
	if err := d.peers.Register(newPeer(id, head, getHashes, getBlocks)); err != nil {
		glog.V(logger.Error).Infoln("Register failed:", err)
		return err
	}
119 120 121
	return nil
}

122 123 124 125 126 127 128 129 130
// UnregisterPeer remove a peer from the known list, preventing any action from
// the specified peer.
func (d *Downloader) UnregisterPeer(id string) error {
	glog.V(logger.Detail).Infoln("Unregistering peer", id)
	if err := d.peers.Unregister(id); err != nil {
		glog.V(logger.Error).Infoln("Unregister failed:", err)
		return err
	}
	return nil
131 132
}

133
// Synchronise will select the peer and use it for synchronising. If an empty string is given
134
// it will use the best peer possible and synchronize if it's TD is higher than our own. If any of the
135
// checks fail an error will be returned. This method is synchronous
136
func (d *Downloader) Synchronise(id string, hash common.Hash) error {
137
	// Make sure only one goroutine is ever allowed past this point at once
138 139
	if !atomic.CompareAndSwapInt32(&d.synchronising, 0, 1) {
		return ErrBusy
140
	}
141
	defer atomic.StoreInt32(&d.synchronising, 0)
142

143 144 145 146
	// Post a user notification of the sync (only once per session)
	if atomic.CompareAndSwapInt32(&d.notified, 0, 1) {
		glog.V(logger.Info).Infoln("Block synchronisation started")
	}
147 148 149

	d.mux.Post(StartEvent{})

150 151
	// Create cancel channel for aborting mid-flight
	d.cancelLock.Lock()
152
	d.cancelCh = make(chan struct{})
153
	d.cancelLock.Unlock()
154

155
	// Abort if the queue still contains some leftover data
156
	if _, cached := d.queue.Size(); cached > 0 && d.queue.GetHeadBlock() != nil {
157
		return ErrPendingQueue
158
	}
159
	// Reset the queue and peer set to clean any internal leftover state
160
	d.queue.Reset()
161
	d.peers.Reset()
162
	d.checks = make(map[common.Hash]time.Time)
163

164
	// Retrieve the origin peer and initiate the downloading process
165
	p := d.peers.Peer(id)
166
	if p == nil {
167
		return errUnknownPeer
168
	}
169
	return d.syncWithPeer(p, hash)
170 171
}

172
// TakeBlocks takes blocks from the queue and yields them to the caller.
173
func (d *Downloader) TakeBlocks() types.Blocks {
174
	return d.queue.TakeBlocks()
175 176
}

177
func (d *Downloader) Has(hash common.Hash) bool {
178
	return d.queue.Has(hash)
179 180
}

181 182 183
// syncWithPeer starts a block synchronization based on the hash chain from the
// specified peer and head hash.
func (d *Downloader) syncWithPeer(p *peer, hash common.Hash) (err error) {
184 185 186
	defer func() {
		// reset on error
		if err != nil {
187
			d.Cancel()
188 189 190
			d.mux.Post(FailedEvent{err})
		} else {
			d.mux.Post(DoneEvent{})
191 192
		}
	}()
193

194
	glog.V(logger.Debug).Infoln("Synchronizing with the network using:", p.id)
195
	if err = d.fetchHashes(p, hash); err != nil {
196 197
		return err
	}
198
	if err = d.fetchBlocks(); err != nil {
199
		return err
200
	}
201
	glog.V(logger.Debug).Infoln("Synchronization completed")
202 203

	return nil
204 205
}

206 207 208 209
// Cancel cancels all of the operations and resets the queue. It returns true
// if the cancel operation was completed.
func (d *Downloader) Cancel() bool {
	// If we're not syncing just return.
210
	hs, bs := d.queue.Size()
211 212 213
	if atomic.LoadInt32(&d.synchronising) == 0 && hs == 0 && bs == 0 {
		return false
	}
214
	// Close the current cancel channel
215 216 217 218 219 220 221 222
	d.cancelLock.Lock()
	select {
	case <-d.cancelCh:
		// Channel was already closed
	default:
		close(d.cancelCh)
	}
	d.cancelLock.Unlock()
223 224 225 226 227 228 229

	// reset the queue
	d.queue.Reset()

	return true
}

230
// XXX Make synchronous
231
func (d *Downloader) fetchHashes(p *peer, h common.Hash) error {
O
obscuren 已提交
232
	glog.V(logger.Debug).Infof("Downloading hashes (%x) from %s", h[:4], p.id)
233 234 235

	start := time.Now()

236
	// Add the hash to the queue first, and start hash retrieval
237
	d.queue.Insert([]common.Hash{h})
O
obscuren 已提交
238
	p.getHashes(h)
239

O
obscuren 已提交
240
	var (
241 242 243 244 245 246
		active = p             // active peer will help determine the current active peer
		head   = common.Hash{} // common and last hash

		timeout     = time.NewTimer(hashTTL)          // timer to dump a non-responsive active peer
		attempted   = make(map[string]bool)           // attempted peers will help with retries
		crossTicker = time.NewTicker(crossCheckCycle) // ticker to periodically check expired cross checks
O
obscuren 已提交
247
	)
248
	defer crossTicker.Stop()
249

250 251
	attempted[p.id] = true
	for finished := false; !finished; {
252
		select {
253 254
		case <-d.cancelCh:
			return errCancelHashFetch
255

O
obscuren 已提交
256
		case hashPack := <-d.hashCh:
257
			// Make sure the active peer is giving us the hashes
258
			if hashPack.peerId != active.id {
O
obscuren 已提交
259 260 261
				glog.V(logger.Debug).Infof("Received hashes from incorrect peer(%s)\n", hashPack.peerId)
				break
			}
262
			timeout.Reset(hashTTL)
O
obscuren 已提交
263

264 265
			// Make sure the peer actually gave something valid
			if len(hashPack.hashes) == 0 {
266
				glog.V(logger.Debug).Infof("Peer (%s) responded with empty hash set\n", active.id)
267 268 269 270
				return errEmptyHashSet
			}
			// Determine if we're done fetching hashes (queue up all pending), and continue if not done
			done, index := false, 0
271 272 273
			for index, head = range hashPack.hashes {
				if d.hasBlock(head) || d.queue.GetBlock(head) != nil {
					glog.V(logger.Debug).Infof("Found common hash %x\n", head[:4])
274
					hashPack.hashes = hashPack.hashes[:index]
275 276 277 278
					done = true
					break
				}
			}
279 280
			// Insert all the new hashes, but only continue if got something useful
			inserts := d.queue.Insert(hashPack.hashes)
281 282
			if len(inserts) == 0 && !done {
				glog.V(logger.Debug).Infof("Peer (%s) responded with stale hashes\n", active.id)
283
				return ErrBadPeer
284 285
			}
			if !done {
286 287 288 289
				// Try and fetch a random block to verify the hash batch
				cross := inserts[rand.Intn(len(inserts))]
				glog.V(logger.Detail).Infof("Cross checking (%s) with %x", active.id, cross)

290
				d.checks[cross] = time.Now().Add(blockTTL)
291 292 293 294
				active.getBlocks([]common.Hash{cross})

				// Also fetch a fresh
				active.getHashes(head)
295
				continue
296
			}
297 298
			// We're done, allocate the download cache and proceed pulling the blocks
			offset := 0
299
			if block := d.getBlock(head); block != nil {
300 301 302
				offset = int(block.NumberU64() + 1)
			}
			d.queue.Alloc(offset)
303 304 305 306 307 308 309
			finished = true

		case blockPack := <-d.blockCh:
			// Cross check the block with the random verifications
			if blockPack.peerId != active.id || len(blockPack.blocks) != 1 {
				continue
			}
310 311 312 313 314 315 316
			block := blockPack.blocks[0]
			if _, ok := d.checks[block.Hash()]; ok {
				if !d.queue.Has(block.ParentHash()) {
					return ErrCrossCheckFailed
				}
				delete(d.checks, block.Hash())
			}
317 318 319

		case <-crossTicker.C:
			// Iterate over all the cross checks and fail the hash chain if they're not verified
320
			for hash, deadline := range d.checks {
321 322 323 324 325
				if time.Now().After(deadline) {
					glog.V(logger.Debug).Infof("Cross check timeout for %x", hash)
					return ErrCrossCheckFailed
				}
			}
326

327
		case <-timeout.C:
328 329
			glog.V(logger.Debug).Infof("Peer (%s) didn't respond in time for hash request\n", p.id)

O
obscuren 已提交
330 331 332 333
			var p *peer // p will be set if a peer can be found
			// Attempt to find a new peer by checking inclusion of peers best hash in our
			// already fetched hash list. This can't guarantee 100% correctness but does
			// a fair job. This is always either correct or false incorrect.
334
			for _, peer := range d.peers.AllPeers() {
335
				if d.queue.Has(peer.head) && !attempted[peer.id] {
O
obscuren 已提交
336 337 338 339 340 341
					p = peer
					break
				}
			}
			// if all peers have been tried, abort the process entirely or if the hash is
			// the zero hash.
342
			if p == nil || (head == common.Hash{}) {
343
				return ErrTimeout
O
obscuren 已提交
344 345 346
			}
			// set p to the active peer. this will invalidate any hashes that may be returned
			// by our previous (delayed) peer.
347 348
			active = p
			p.getHashes(head)
O
obscuren 已提交
349
			glog.V(logger.Debug).Infof("Hash fetching switched to new peer(%s)\n", p.id)
350 351
		}
	}
352
	glog.V(logger.Debug).Infof("Downloaded hashes (%d) in %v\n", d.queue.Pending(), time.Since(start))
353 354 355 356

	return nil
}

357 358 359 360
// fetchBlocks iteratively downloads the entire schedules block-chain, taking
// any available peers, reserving a chunk of blocks for each, wait for delivery
// and periodically checking for timeouts.
func (d *Downloader) fetchBlocks() error {
361
	glog.V(logger.Debug).Infoln("Downloading", d.queue.Pending(), "block(s)")
362 363
	start := time.Now()

364
	// default ticker for re-fetching blocks every now and then
365 366 367 368
	ticker := time.NewTicker(20 * time.Millisecond)
out:
	for {
		select {
369 370
		case <-d.cancelCh:
			return errCancelBlockFetch
371

372
		case blockPack := <-d.blockCh:
373 374 375 376 377 378 379 380
			// Short circuit if it's a stale cross check
			if len(blockPack.blocks) == 1 {
				block := blockPack.blocks[0]
				if _, ok := d.checks[block.Hash()]; ok {
					delete(d.checks, block.Hash())
					continue
				}
			}
381 382
			// If the peer was previously banned and failed to deliver it's pack
			// in a reasonable time frame, ignore it's message.
383
			if peer := d.peers.Peer(blockPack.peerId); peer != nil {
384
				// Deliver the received chunk of blocks
385
				if err := d.queue.Deliver(blockPack.peerId, blockPack.blocks); err != nil {
386 387 388 389 390
					if err == ErrInvalidChain {
						// The hash chain is invalid (blocks are not ordered properly), abort
						return err
					}
					// Peer did deliver, but some blocks were off, penalize
391
					glog.V(logger.Debug).Infof("Failed delivery for peer %s: %v\n", blockPack.peerId, err)
392
					peer.Demote()
393 394 395
					break
				}
				if glog.V(logger.Debug) {
396
					glog.Infof("Added %d blocks from: %s\n", len(blockPack.blocks), blockPack.peerId)
397
				}
398 399 400
				// Promote the peer and update it's idle state
				peer.Promote()
				peer.SetIdle()
401
			}
402
		case <-ticker.C:
403 404 405 406 407
			// Check for bad peers. Bad peers may indicate a peer not responding
			// to a `getBlocks` message. A timeout of 5 seconds is set. Peers
			// that badly or poorly behave are removed from the peer set (not banned).
			// Bad peers are excluded from the available peer set and therefor won't be
			// reused. XXX We could re-introduce peers after X time.
408
			badPeers := d.queue.Expire(blockTTL)
409 410 411 412 413 414
			for _, pid := range badPeers {
				// XXX We could make use of a reputation system here ranking peers
				// in their performance
				// 1) Time for them to respond;
				// 2) Measure their speed;
				// 3) Amount and availability.
415 416 417
				if peer := d.peers.Peer(pid); peer != nil {
					peer.Demote()
				}
418 419
			}
			// After removing bad peers make sure we actually have sufficient peer left to keep downloading
420
			if d.peers.Len() == 0 {
421 422
				return errNoPeers
			}
423 424
			// If there are unrequested hashes left start fetching
			// from the available peers.
425 426 427 428 429
			if d.queue.Pending() > 0 {
				// Throttle the download if block cache is full and waiting processing
				if d.queue.Throttle() {
					continue
				}
430
				// Send a download request to all idle peers, until throttled
431 432
				idlePeers := d.peers.IdlePeers()
				for _, peer := range idlePeers {
433 434 435 436
					// Short circuit if throttling activated since above
					if d.queue.Throttle() {
						break
					}
437 438
					// Get a possible chunk. If nil is returned no chunk
					// could be returned due to no hashes available.
439 440
					request := d.queue.Reserve(peer, maxBlockFetch)
					if request == nil {
441 442 443 444 445
						continue
					}
					// Fetch the chunk and check for error. If the peer was somehow
					// already fetching a chunk due to a bug, it will be returned to
					// the queue
446 447
					if err := peer.Fetch(request); err != nil {
						glog.V(logger.Error).Infof("Peer %s received double work\n", peer.id)
448
						d.queue.Cancel(request)
449 450
					}
				}
451
				// Make sure that we have peers available for fetching. If all peers have been tried
452
				// and all failed throw an error
453
				if d.queue.InFlight() == 0 {
454
					return errPeersUnavailable
455 456
				}

457 458
			} else if d.queue.InFlight() == 0 {
				// When there are no more queue and no more in flight, We can
459 460 461 462 463 464
				// safely assume we're done. Another part of the process will  check
				// for parent errors and will re-request anything that's missing
				break out
			}
		}
	}
465 466 467 468 469
	glog.V(logger.Detail).Infoln("Downloaded block(s) in", time.Since(start))

	return nil
}

470 471 472
// DeliverBlocks injects a new batch of blocks received from a remote node.
// This is usually invoked through the BlocksMsg by the protocol handler.
func (d *Downloader) DeliverBlocks(id string, blocks []*types.Block) error {
473 474 475 476
	// Make sure the downloader is active
	if atomic.LoadInt32(&d.synchronising) == 0 {
		return errNoSyncActive
	}
477 478 479 480
	// Deliver or abort if the sync is canceled while queuing
	d.cancelLock.RLock()
	cancel := d.cancelCh
	d.cancelLock.RUnlock()
481

482 483 484 485 486 487 488
	select {
	case d.blockCh <- blockPack{id, blocks}:
		return nil

	case <-cancel:
		return errNoSyncActive
	}
O
moved  
obscuren 已提交
489 490
}

491 492 493 494
// DeliverHashes injects a new batch of hashes received from a remote node into
// the download schedule. This is usually invoked through the BlockHashesMsg by
// the protocol handler.
func (d *Downloader) DeliverHashes(id string, hashes []common.Hash) error {
495 496 497 498
	// Make sure the downloader is active
	if atomic.LoadInt32(&d.synchronising) == 0 {
		return errNoSyncActive
	}
499 500 501 502
	// Deliver or abort if the sync is canceled while queuing
	d.cancelLock.RLock()
	cancel := d.cancelCh
	d.cancelLock.RUnlock()
503

504 505 506 507 508 509 510
	select {
	case d.hashCh <- hashPack{id, hashes}:
		return nil

	case <-cancel:
		return errNoSyncActive
	}
511
}