downloader: added missing blocks catchup functionality

When a block's parent is missing from the block list, an attempt should be made to fetch the missing parent and its grandparents.

downloader: added missing blocks catchup functionality
When a block's parent is missing from the block list, an attempt should be made to fetch the missing parent and its grandparents.
a8a2b2a4 · obscuren · 7dcb9825 · a8a2b2a4 · a8a2b2a4 · a8a2b2a4
4 changed files
--- a/eth/downloader/downloader.go
+++ b/eth/downloader/downloader.go
@@ -8,63 +8,44 @@ import (
 	"time"

 	"github.com/ethereum/go-ethereum/common"
+	"github.com/ethereum/go-ethereum/core"
 	"github.com/ethereum/go-ethereum/core/types"
 	"github.com/ethereum/go-ethereum/logger"
 	"github.com/ethereum/go-ethereum/logger/glog"
 	"gopkg.in/fatih/set.v0"
 )

-const maxBlockFetch = 256
+const (
+	maxBlockFetch       = 256 // Amount of max blocks to be fetched per chunk
+	minDesiredPeerCount = 3   // Amount of peers desired to start syncing
+)

-type hashFetcherFn func(common.Hash) error
-type blockFetcherFn func([]common.Hash) error
 type hashCheckFn func(common.Hash) bool
 type chainInsertFn func(types.Blocks) error
 type hashIterFn func() (common.Hash, error)
-
-// XXX make threadsafe!!!!
-type peers map[string]*peer
-
-func (p peers) get(state int) []*peer {
-	var peers []*peer
-	for _, peer := range p {
-		peer.mu.RLock()
-		if peer.state == state {
-			peers = append(peers, peer)
-		}
-		peer.mu.RUnlock()
-	}
-
-	return peers
-}
-
-func (p peers) setState(id string, state int) {
-	if peer, exist := p[id]; exist {
-		peer.mu.Lock()
-		defer peer.mu.Unlock()
-		peer.state = state
-	}
-}
+type currentTdFn func() *big.Int

 type Downloader struct {
+	mu    sync.RWMutex
 	queue *queue
+	peers peers

+	// Callbacks
 	hasBlock    hashCheckFn
 	insertChain chainInsertFn
+	currentTd   currentTdFn

-	mu    sync.RWMutex
-	peers peers
-
-	currentPeer *peer
-
+	// Status
 	fetchingHashes    int32
 	downloadingBlocks int32
-
-	newPeerCh    chan *peer
-	selectPeerCh chan *peer
-	HashCh       chan []common.Hash
-	blockCh      chan blockPack
-	quit         chan struct{}
+	processingBlocks  int32
+
+	// Channels
+	newPeerCh chan *peer
+	syncCh    chan syncPack
+	HashCh    chan []common.Hash
+	blockCh   chan blockPack
+	quit      chan struct{}
 }

 type blockPack struct {
@@ -72,17 +53,23 @@ type blockPack struct {
 	blocks []*types.Block
 }

-func New(hasBlock hashCheckFn, insertChain chainInsertFn) *Downloader {
+// syncPack is the value sent over Downloader.syncCh to request a
+// synchronisation run.
+type syncPack struct {
+	// peer is the peer the run is performed against.
+	peer *peer
+	// hash is the hash handed to startFetchingHashes for that run.
+	hash common.Hash
+}
+
+func New(hasBlock hashCheckFn, insertChain chainInsertFn, currentTd currentTdFn) *Downloader {
 	downloader := &Downloader{
-		queue:        newqueue(),
-		peers:        make(peers),
-		hasBlock:     hasBlock,
-		insertChain:  insertChain,
-		newPeerCh:    make(chan *peer, 1),
-		selectPeerCh: make(chan *peer, 1),
-		HashCh:       make(chan []common.Hash, 1),
-		blockCh:      make(chan blockPack, 1),
-		quit:         make(chan struct{}),
+		queue:       newqueue(),
+		peers:       make(peers),
+		hasBlock:    hasBlock,
+		insertChain: insertChain,
+		currentTd:   currentTd,
+		newPeerCh:   make(chan *peer, 1),
+		syncCh:      make(chan syncPack, 1),
+		HashCh:      make(chan []common.Hash, 1),
+		blockCh:     make(chan blockPack, 1),
+		quit:        make(chan struct{}),
 	}
 	go downloader.peerHandler()
 	go downloader.update()
@@ -116,89 +103,88 @@ func (d *Downloader) UnregisterPeer(id string) {
 }

 func (d *Downloader) peerHandler() {
-	// Fields defined here so we can reduce the amount of locking
-	// that needs to be done
-	var highestTd = new(big.Int)
+	// itimer is used to determine when to start ignoring `minDesiredPeerCount`
+	itimer := time.NewTicker(5 * time.Second)
 out:
 	for {
 		select {
-		case newPeer := <-d.newPeerCh:
-			// Check if TD of peer is higher than our current
-			if newPeer.td.Cmp(highestTd) > 0 {
-				glog.V(logger.Detail).Infoln("New peer with highest TD =", newPeer.td)
-
-				highestTd.Set(newPeer.td)
-				// select the peer for downloading
-				d.selectPeerCh <- newPeer
+		case <-d.newPeerCh:
+			// Meet the `minDesiredPeerCount` before we select our best peer
+			if len(d.peers) < minDesiredPeerCount {
+				break
 			}
+			d.selectPeer(d.peers.bestPeer())
+		case <-itimer.C:
+			// The timer will make sure that the downloader keeps an active state
+			// in which it attempts to always check the network for highest td peers
+			d.selectPeer(d.peers.bestPeer())
 		case <-d.quit:
 			break out
 		}
 	}
 }

+// selectPeer requests a synchronisation run against p by sending a syncPack on
+// syncCh. It does nothing when p is nil, when hashes are being fetched, blocks
+// are being downloaded or blocks are being processed, when p's total
+// difficulty does not exceed currentTd(), or when p's most recent hash is
+// already known according to hasBlock.
+func (d *Downloader) selectPeer(p *peer) {
+	// bestPeer returns nil when no peers are registered.
+	if p == nil {
+		return
+	}
+	// Make sure the downloader is idle. Once the running cycle is done the
+	// process can be restarted if the TD is higher; until then just get on
+	// with whatever is going on. This prevents unnecessary switching.
+	if d.isFetchingHashes() || d.isDownloadingBlocks() || d.isProcessing() {
+		return
+	}
+
+	// AddBlock updates td and recentHash under the peer lock, so read both
+	// under the same lock to get a consistent pair.
+	p.mu.RLock()
+	td, head := p.td, p.recentHash
+	p.mu.RUnlock()
+
+	// The selected peer must be better than our own chain.
+	// XXX the peer's recent hash is checked as well to make sure we don't
+	// already have it; some peers appear to report an incorrect TD.
+	if td.Cmp(d.currentTd()) <= 0 || d.hasBlock(head) {
+		return
+	}
+
+	glog.V(logger.Detail).Infoln("New peer with highest TD =", td)
+	d.syncCh <- syncPack{p, head}
+}
+
 func (d *Downloader) update() {
 out:
 	for {
 		select {
-		case selectedPeer := <-d.selectPeerCh:
-			// Make sure it's doing neither. Once done we can restart the
-			// downloading process if the TD is higher. For now just get on
-			// with whatever is going on. This prevents unecessary switching.
-			if !(d.isFetchingHashes() || d.isDownloadingBlocks()) {
-				glog.V(logger.Detail).Infoln("Selected new peer", selectedPeer.id)
-				// Start the fetcher. This will block the update entirely
-				// interupts need to be send to the appropriate channels
-				// respectively.
-				if err := d.startFetchingHashes(selectedPeer); err != nil {
-					// handle error
-					glog.V(logger.Debug).Infoln("Error fetching hashes:", err)
-					// Reset
-					break
-				}
-
-				// Start fetching blocks in paralel. The strategy is simple
-				// take any available peers, seserve a chunk for each peer available,
-				// let the peer deliver the chunkn and periodically check if a peer
-				// has timedout. When done downloading, process blocks.
-				if err := d.startFetchingBlocks(selectedPeer); err != nil {
-					glog.V(logger.Debug).Infoln("Error downloading blocks:", err)
-					// reset
-					break
-				}
+		case sync := <-d.syncCh:
+			selectedPeer := sync.peer
+			glog.V(logger.Detail).Infoln("Synchronising with network using:", selectedPeer.id)
+			// Start the fetcher. This will block the update entirely
+			// interupts need to be send to the appropriate channels
+			// respectively.
+			if err := d.startFetchingHashes(selectedPeer, sync.hash); err != nil {
+				// handle error
+				glog.V(logger.Debug).Infoln("Error fetching hashes:", err)
+				// XXX Reset
+				break
+			}

-				// XXX this will move when optimised
-				// Sort the blocks by number. This bit needs much improvement. Right now
-				// it assumes full honesty form peers (i.e. it's not checked when the blocks
-				// link). We should at least check whihc queue match. This code could move
-				// to a seperate goroutine where it periodically checks for linked pieces.
-				types.BlockBy(types.Number).Sort(d.queue.blocks)
-				blocks := d.queue.blocks
-
-				glog.V(logger.Debug).Infoln("Inserting chain with", len(blocks), "blocks")
-				// Loop untill we're out of queue
-				for len(blocks) != 0 {
-					max := int(math.Min(float64(len(blocks)), 256))
-					// TODO check for parent error. When there's a parent error we should stop
-					// processing and start requesting the `block.hash` so that it's parent and
-					// grandparents can be requested and queued.
-					d.insertChain(blocks[:max])
-					blocks = blocks[max:]
-				}
+			// Start fetching blocks in paralel. The strategy is simple
+			// take any available peers, seserve a chunk for each peer available,
+			// let the peer deliver the chunkn and periodically check if a peer
+			// has timedout. When done downloading, process blocks.
+			if err := d.startFetchingBlocks(selectedPeer); err != nil {
+				glog.V(logger.Debug).Infoln("Error downloading blocks:", err)
+				// XXX reset
+				break
 			}
+
+			glog.V(logger.Detail).Infoln("Sync completed")
+
+			d.process()
 		case <-d.quit:
 			break out
 		}
 	}
 }

-func (d *Downloader) startFetchingHashes(p *peer) error {
+// XXX Make synchronous
+func (d *Downloader) startFetchingHashes(p *peer, hash common.Hash) error {
 	glog.V(logger.Debug).Infoln("Downloading hashes")

 	start := time.Now()

 	// Get the first batch of hashes
-	p.getHashes(p.recentHash)
+	p.getHashes(hash)
 	atomic.StoreInt32(&d.fetchingHashes, 1)

 out:
@@ -237,10 +223,6 @@ out:
 	return nil
 }

-func (d *Downloader) DeliverBlocks(id string, block []*types.Block) {
-	d.blockCh <- blockPack{id, block}
-}
-
 func (d *Downloader) startFetchingBlocks(p *peer) error {
 	glog.V(logger.Detail).Infoln("Downloading", d.queue.hashPool.Size(), "blocks")
 	atomic.StoreInt32(&d.downloadingBlocks, 1)
@@ -253,8 +235,6 @@ out:
 	for {
 		select {
 		case blockPack := <-d.blockCh:
-			//fmt.Println("get for", blockPack.peerId)
-
 			d.queue.deliver(blockPack.peerId, blockPack.blocks)
 			d.peers.setState(blockPack.peerId, idleState)
 		case <-ticker.C:
@@ -266,21 +246,24 @@ out:
 					// Get a possible chunk. If nil is returned no chunk
 					// could be returned due to no hashes available.
 					chunk := d.queue.get(peer, maxBlockFetch)
-					if chunk != nil {
-						//fmt.Println("fetching for", peer.id)
-						// Fetch the chunk and check for error. If the peer was somehow
-						// already fetching a chunk due to a bug, it will be returned to
-						// the queue
-						if err := peer.fetch(chunk); err != nil {
-							// log for tracing
-							glog.V(logger.Debug).Infof("peer %s received double work (state = %v)\n", peer.id, peer.state)
-							d.queue.put(chunk.hashes)
-						}
+					if chunk == nil {
+						continue
+					}
+
+					//fmt.Println("fetching for", peer.id)
+					// XXX make fetch blocking.
+					// Fetch the chunk and check for error. If the peer was somehow
+					// already fetching a chunk due to a bug, it will be returned to
+					// the queue
+					if err := peer.fetch(chunk); err != nil {
+						// log for tracing
+						glog.V(logger.Debug).Infof("peer %s received double work (state = %v)\n", peer.id, peer.state)
+						d.queue.put(chunk.hashes)
 					}
 				}
 				atomic.StoreInt32(&d.downloadingBlocks, 1)
 			} else if len(d.queue.fetching) == 0 {
-				// Whene there are no more queue and no more `fetching`. We can
+				// When there are no more queue and no more `fetching`. We can
 				// safely assume we're done. Another part of the process will  check
 				// for parent errors and will re-request anything that's missing
 				atomic.StoreInt32(&d.downloadingBlocks, 0)
@@ -325,6 +308,88 @@ out:
 	return nil
 }

+// AddBlock adds an (unrequested) block to the downloader. This is usually done
+// through the NewBlockMsg by the protocol handler.
+//
+// id identifies the peer the block came from and td is the total difficulty
+// that accompanied it. Blocks that hasBlock already reports as known are
+// ignored. Otherwise the block is queued, the peer's td and recent hash are
+// updated when the peer is registered, and the queue is processed right away
+// if the downloader is otherwise idle.
+func (d *Downloader) AddBlock(id string, block *types.Block, td *big.Int) {
+	hash := block.Hash()
+	if d.hasBlock(hash) {
+		return
+	}
+
+	glog.V(logger.Detail).Infoln("Inserting new block from:", id)
+	d.queue.addBlock(id, block, td)
+
+	// If the peer is in our healthy list of peers, update its td and head.
+	// Here is a good chance to add the peer back to the list.
+	if peer := d.peers.getPeer(id); peer != nil {
+		peer.mu.Lock()
+		peer.td = td
+		peer.recentHash = hash
+		peer.mu.Unlock()
+	}
+
+	// Only process when nothing else is going on. A running process() is
+	// checked as well: it sorts and rewrites the queued blocks, so a second
+	// run started from here would operate on the same slice concurrently. The
+	// block stays queued and is picked up by the next run.
+	if d.isFetchingHashes() || d.isDownloadingBlocks() || d.isProcessing() {
+		return
+	}
+	if err := d.process(); err != nil {
+		glog.V(logger.Debug).Infoln("Error processing queued blocks:", err)
+	}
+}
+
+// DeliverChunk hands a chunk of blocks received from the peer identified by id
+// to the downloader by sending it on blockCh. This is usually done through the
+// BlocksMsg by the protocol handler.
+func (d *Downloader) DeliverChunk(id string, blocks []*types.Block) {
+	pack := blockPack{peerId: id, blocks: blocks}
+	d.blockCh <- pack
+}
+
+// process sorts the queued blocks by number and inserts them into the chain in
+// batches of at most maxBlockFetch blocks.
+//
+// When insertChain reports a missing-parent error, processing stops. A sync
+// starting at the first block of the failing batch whose parent is unknown is
+// requested from the best peer, and that block plus everything after it stays
+// queued for a later run. Any other insertChain error does not stop
+// processing; the batch is dropped and the next one is tried.
+//
+// It returns the error of the last insertChain call, or nil when the queue
+// was empty.
+func (d *Downloader) process() error {
+	atomic.StoreInt32(&d.processingBlocks, 1)
+	defer atomic.StoreInt32(&d.processingBlocks, 0)
+
+	// Detach the queued blocks under the queue lock. The queue appends to
+	// queue.blocks while holding queue.mu, so working on a detached slice keeps
+	// blocks that arrive during processing from being overwritten when the
+	// remainder is written back below.
+	d.queue.mu.Lock()
+	blocks := d.queue.blocks
+	d.queue.blocks = nil
+	d.queue.mu.Unlock()
+
+	// XXX this will move when optimised
+	// Sort the blocks by number. This bit needs much improvement. Right now
+	// it assumes full honesty from peers (i.e. it's not checked whether the
+	// blocks link). We should at least check which queues match. This code
+	// could move to a separate goroutine that periodically checks for linked
+	// pieces.
+	types.BlockBy(types.Number).Sort(blocks)
+
+	glog.V(logger.Debug).Infoln("Inserting chain with", len(blocks), "blocks")
+
+	var err error
+	// Loop until we're out of blocks
+	for len(blocks) != 0 {
+		n := int(math.Min(float64(len(blocks)), maxBlockFetch))
+
+		err = d.insertChain(blocks[:n])
+		if err != nil && core.IsParentErr(err) {
+			glog.V(logger.Debug).Infoln("Aborting process due to missing parent. Fetching hashes")
+
+			// Find the first block of the batch whose parent we don't have and
+			// request its ancestry.
+			for i, block := range blocks[:n] {
+				if d.hasBlock(block.ParentHash()) {
+					continue
+				}
+				// bestPeer returns nil when no peers are registered. The send
+				// must not block: process may be running on the goroutine
+				// that receives from syncCh, and a full buffer would then
+				// never drain.
+				if best := d.peers.bestPeer(); best != nil {
+					select {
+					case d.syncCh <- syncPack{best, block.Hash()}:
+					default:
+						glog.V(logger.Debug).Infoln("Sync request already pending; missing parent will be retried")
+					}
+				}
+				// Blocks in front of the failing one are treated as processed.
+				blocks = blocks[i:]
+				break
+			}
+			break
+		}
+		blocks = blocks[n:]
+	}
+
+	// Put whatever is left back in front of the blocks that were queued in the
+	// meantime. When nothing is left the queue keeps no reference to the
+	// processed blocks, which allows the GC to reclaim them.
+	if len(blocks) != 0 {
+		d.queue.mu.Lock()
+		d.queue.blocks = append(blocks, d.queue.blocks...)
+		d.queue.mu.Unlock()
+	}
+	return err
+}
+
 func (d *Downloader) isFetchingHashes() bool {
 	return atomic.LoadInt32(&d.fetchingHashes) == 1
 }
@@ -332,3 +397,7 @@ func (d *Downloader) isFetchingHashes() bool {
 func (d *Downloader) isDownloadingBlocks() bool {
 	return atomic.LoadInt32(&d.downloadingBlocks) == 1
 }
+
+// isProcessing reports whether the processingBlocks flag is currently set,
+// i.e. whether a process() run is in progress.
+func (d *Downloader) isProcessing() bool {
+	flag := atomic.LoadInt32(&d.processingBlocks)
+	return flag == 1
+}
--- a/eth/downloader/downloader_test.go
+++ b/eth/downloader/downloader_test.go
@@ -14,7 +14,7 @@ import (

 var knownHash = common.Hash{1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}

-func createHashes(amount int) (hashes []common.Hash) {
+func createHashes(start, amount int) (hashes []common.Hash) {
 	hashes = make([]common.Hash, amount+1)
 	hashes[len(hashes)-1] = knownHash

@@ -49,7 +49,7 @@ type downloadTester struct {

 func newTester(t *testing.T, hashes []common.Hash, blocks map[common.Hash]*types.Block) *downloadTester {
 	tester := &downloadTester{t: t, hashes: hashes, blocks: blocks, done: make(chan bool)}
-	downloader := New(tester.hasBlock, tester.insertChain)
+	downloader := New(tester.hasBlock, tester.insertChain, func() *big.Int { return new(big.Int) })
 	tester.downloader = downloader

 	return tester
@@ -84,7 +84,7 @@ func (dl *downloadTester) getBlocks(id string) func([]common.Hash) error {
 			blocks[i] = dl.blocks[hash]
 		}

-		go dl.downloader.DeliverBlocks(id, blocks)
+		go dl.downloader.DeliverChunk(id, blocks)

 		return nil
 	}
@@ -109,11 +109,11 @@ func TestDownload(t *testing.T) {
 	glog.SetV(logger.Detail)
 	glog.SetToStderr(true)

-	hashes := createHashes(1000)
+	hashes := createHashes(0, 1000)
 	blocks := createBlocksFromHashes(hashes)
 	tester := newTester(t, hashes, blocks)

-	tester.newPeer("peer1", big.NewInt(10000), hashes[len(hashes)-1])
+	tester.newPeer("peer1", big.NewInt(10000), hashes[0])
 	tester.newPeer("peer2", big.NewInt(0), common.Hash{})
 	tester.badBlocksPeer("peer3", big.NewInt(0), common.Hash{})
 	tester.badBlocksPeer("peer4", big.NewInt(0), common.Hash{})
@@ -126,3 +126,30 @@ success:
 		t.Error("timout")
 	}
 }
+
+// TestMissing registers two peers, waits for the tester to signal done and
+// then announces an extra block from peer2 through AddBlock. It is currently
+// skipped unconditionally.
+// NOTE(review): presumably meant to cover the missing-parent catch-up path;
+// confirm before enabling.
+func TestMissing(t *testing.T) {
+	t.Skip()
+
+	glog.SetV(logger.Detail)
+	glog.SetToStderr(true)
+
+	hashes := createHashes(0, 1000)
+	extraHashes := createHashes(1001, 1003)
+	blocks := createBlocksFromHashes(append(extraHashes, hashes...))
+	tester := newTester(t, hashes, blocks)
+
+	tester.newPeer("peer1", big.NewInt(10000), hashes[len(hashes)-1])
+
+	hashes = append(extraHashes, hashes[:len(hashes)-1]...)
+	tester.newPeer("peer2", big.NewInt(0), common.Hash{})
+
+	select {
+	case <-tester.done:
+	case <-time.After(10 * time.Second): // XXX this could actually fail on a slow computer
+		t.Error("timeout")
+	}
+
+	tester.downloader.AddBlock("peer2", blocks[hashes[len(hashes)-1]], big.NewInt(10001))
+}
--- a/eth/downloader/peer.go
+++ b/eth/downloader/peer.go
@@ -13,9 +13,51 @@ const (
 	idleState    = 4
 )

+type hashFetcherFn func(common.Hash) error
+type blockFetcherFn func([]common.Hash) error
+
+// XXX make threadsafe!!!!
+type peers map[string]*peer
+
+// get returns every peer whose state equals the given state. Each peer's state
+// is read while holding that peer's read lock.
+func (p peers) get(state int) []*peer {
+	var matching []*peer
+	for _, candidate := range p {
+		candidate.mu.RLock()
+		inState := candidate.state == state
+		candidate.mu.RUnlock()
+
+		if inState {
+			matching = append(matching, candidate)
+		}
+	}
+	return matching
+}
+
+// setState sets the state of the peer registered under id while holding that
+// peer's write lock. Unknown ids are ignored.
+func (p peers) setState(id string, state int) {
+	target, known := p[id]
+	if !known {
+		return
+	}
+
+	target.mu.Lock()
+	target.state = state
+	target.mu.Unlock()
+}
+
+// getPeer returns the peer registered under id, or nil when there is none.
+func (p peers) getPeer(id string) *peer {
+	if found, ok := p[id]; ok {
+		return found
+	}
+	return nil
+}
+
+// bestPeer returns the peer with the highest td, or nil when the set is empty.
+// When several peers share the highest td, which one is returned depends on
+// map iteration order.
+func (p peers) bestPeer() *peer {
+	var best *peer
+	for _, cp := range p {
+		if best == nil {
+			best = cp
+			continue
+		}
+
+		// td is replaced under the peer's write lock elsewhere, so read it
+		// under the read lock. The two locks are taken one after the other,
+		// never nested.
+		cp.mu.RLock()
+		candidateTd := cp.td
+		cp.mu.RUnlock()
+
+		best.mu.RLock()
+		bestTd := best.td
+		best.mu.RUnlock()
+
+		if candidateTd.Cmp(bestTd) > 0 {
+			best = cp
+		}
+	}
+	return best
+}
+
 // peer represents an active peer
 type peer struct {
-	state int
+	state int // Peer state (working, idle)
+	rep   int // TODO peer reputation

 	mu         sync.RWMutex
 	id         string

--- a/eth/downloader/queue.go
+++ b/eth/downloader/queue.go
@@ -2,16 +2,20 @@ package downloader

 import (
 	"math"
+	"math/big"
 	"sync"
 	"time"

+	"github.com/ethereum/go-ethereum/common"
 	"github.com/ethereum/go-ethereum/core/types"
 	"gopkg.in/fatih/set.v0"
 )

 // queue represents hashes that are either need fetching or are being fetched
 type queue struct {
-	hashPool *set.Set
+	hashPool    *set.Set
+	fetchPool   *set.Set
+	blockHashes *set.Set

 	mu       sync.Mutex
 	fetching map[string]*chunk
@@ -20,8 +24,10 @@ type queue struct {

 func newqueue() *queue {
 	return &queue{
-		hashPool: set.New(),
-		fetching: make(map[string]*chunk),
+		hashPool:    set.New(),
+		fetchPool:   set.New(),
+		blockHashes: set.New(),
+		fetching:    make(map[string]*chunk),
 	}
 }

@@ -50,6 +56,8 @@ func (c *queue) get(p *peer, max int) *chunk {
 	})
 	// remove the fetchable hashes from hash pool
 	c.hashPool.Separate(hashes)
+	c.fetchPool.Merge(hashes)
+
 	// Create a new chunk for the seperated hashes. The time is being used
 	// to reset the chunk (timeout)
 	chunk := &chunk{hashes, time.Now()}
@@ -60,6 +68,22 @@ func (c *queue) get(p *peer, max int) *chunk {
 	return chunk
 }

+// has reports whether hash is present in hashPool or in fetchPool (the pool
+// get() moves hashes into when it hands out a chunk).
+func (c *queue) has(hash common.Hash) bool {
+	if c.hashPool.Has(hash) {
+		return true
+	}
+	return c.fetchPool.Has(hash)
+}
+
+// addBlock queues a single block that was not requested through a chunk. The
+// block is appended to the queued blocks unless its hash is already recorded
+// in blockHashes. id and td are currently unused.
+func (c *queue) addBlock(id string, block *types.Block, td *big.Int) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	hash := block.Hash()
+
+	// when adding a block make sure it doesn't already exist
+	if c.blockHashes.Has(hash) {
+		return
+	}
+	// Record the hash, as deliver does for fetched blocks; without this the
+	// check above never sees blocks added here and the same block could be
+	// queued more than once.
+	c.blockHashes.Add(hash)
+
+	// The block is here now, so its hash no longer needs fetching.
+	c.hashPool.Remove(hash)
+	c.blocks = append(c.blocks, block)
+}
+
+// deliver delivers a chunk to the queue that was requested of the peer
 func (c *queue) deliver(id string, blocks []*types.Block) {
 	c.mu.Lock()
 	defer c.mu.Unlock()
@@ -70,15 +94,19 @@ func (c *queue) deliver(id string, blocks []*types.Block) {
 		delete(c.fetching, id)

 		// seperate the blocks and the hashes
-		chunk.seperate(blocks)
+		blockHashes := chunk.fetchedHashes(blocks)
+		// merge block hashes
+		c.blockHashes.Merge(blockHashes)
 		// Add the blocks
 		c.blocks = append(c.blocks, blocks...)

 		// Add back whatever couldn't be delivered
 		c.hashPool.Merge(chunk.hashes)
+		c.fetchPool.Separate(chunk.hashes)
 	}
 }

+// puts puts sets of hashes on to the queue for fetching
 func (c *queue) put(hashes *set.Set) {
 	c.mu.Lock()
 	defer c.mu.Unlock()
@@ -91,8 +119,12 @@ type chunk struct {
 	itime  time.Time
 }

-func (ch *chunk) seperate(blocks []*types.Block) {
+func (ch *chunk) fetchedHashes(blocks []*types.Block) *set.Set {
+	fhashes := set.New()
 	for _, block := range blocks {
-		ch.hashes.Remove(block.Hash())
+		fhashes.Add(block.Hash())
 	}
+	ch.hashes.Separate(fhashes)
+
+	return fhashes
 }