/*
 * kernel/power/suspend_block_io.c
 *
 * Copyright 2004-2006 Nigel Cunningham <nigel@suspend2.net>
 *
 * Distributed under GPLv2.
 * 
 * This file contains block io functions for suspend2. These are
 * used by the swapwriter and it is planned that they will also
 * be used by the NFSwriter.
 *
 */

#include <linux/blkdev.h>
#include <linux/syscalls.h>
#include <linux/suspend.h>

#include "suspend.h"
#include "sysfs.h"
#include "modules.h"
#include "prepare_image.h"
#include "block_io.h"
#include "ui.h"

/*
 * Bits in struct io_info->flags.
 *
 * These are bit numbers (used with set_bit()/test_bit()), not masks.
 */
enum {
	/* Set by start_one() when the operation is a READ. */
	IO_AWAITING_READ,
	/* Set by add_to_batch(); cleared by submit_batched(). */
	IO_AWAITING_SUBMIT,
	/* Set by suspend_end_bio(); cleared when cleanup picks the I/O up. */
	IO_AWAITING_CLEANUP,
};

/*
 * Limit on the number of I/Os in flight at once (see get_io_info_struct());
 * also the number of slots in the readahead page and flag arrays.
 */
#define MAX_OUTSTANDING_IO 2048

/*
 *
 *     IO in progress information storage and helpers
 *
 */

/*
 * Per-I/O bookkeeping. Instances are carved out of whole pages by
 * get_io_info_struct() and travel between the ioinfo_* lists via 'list'.
 */
struct io_info {
	/* The bio for this I/O; set in submit(), released in cleanup. */
	struct bio *sys_struct;
	/* block[0] is the bio's starting sector; other slots are not
	 * referenced in this file. */
	sector_t block[MAX_BUF_PER_PAGE];
	/* Page actually handed to the block layer. */
	struct page *buffer_page;
	/* Caller's page: source of a write, destination of a plain read. */
	struct page *data_page;
	/* IO_AWAITING_* bits. */
	unsigned long flags;
	/* Device the I/O is issued against. */
	struct block_device *dev;
	/* Linkage on the free/submit_batch/busy/ready_for_cleanup lists. */
	struct list_head list;
	/* Readahead slot, or -1 when this is not a readahead. */
	int readahead_index;
};

/*
 * submit_params
 *
 * Description of one page of I/O, filled in by the caller and consumed by
 * start_one(). Only page, dev, block[0] and readahead_index are read in
 * this file; the remaining fields are presumably used elsewhere or are
 * historical - TODO confirm.
 */
struct submit_params {
	swp_entry_t swap_address;
	/* Page to read into / write from. */
	struct page *page;
	/* Target device; suspend_do_io() rejects NULL. */
	struct block_device *dev;
	/* block[0] is the starting sector for the I/O. */
	sector_t block[MAX_BUF_PER_PAGE];
	/* Readahead slot to complete, or -1 for ordinary I/O. */
	int readahead_index;
	struct submit_params *next;
	int printme;
};

/* Locks separated to allow better SMP support.
 * An io_info struct moves through the lists as follows.
 * free -> submit_batch -> busy -> ready_for_cleanup -> free
 *
 * (The submit_batch stage is skipped when submit_batch_size is 1; see
 * start_one().)
 */
static LIST_HEAD(ioinfo_free);
static DEFINE_SPINLOCK(ioinfo_free_lock);

/* Filled from suspend_end_bio(), drained by
 * suspend_cleanup_some_completed_io(). */
static LIST_HEAD(ioinfo_ready_for_cleanup);
static DEFINE_SPINLOCK(ioinfo_ready_lock);

/* Prepared I/O waiting for submit_batched() to hand it to the block layer. */
static LIST_HEAD(ioinfo_submit_batch);
static DEFINE_SPINLOCK(ioinfo_submit_lock);

/* I/O that has been passed to submit_bio() and has not yet completed. */
static LIST_HEAD(ioinfo_busy);
static DEFINE_SPINLOCK(ioinfo_busy_lock);

/* Number of entries currently batched, and the batch threshold. The
 * threshold also caps how many entries one submit/cleanup pass handles. */
static atomic_t submit_batch;
static int submit_batch_size = 64;
static int submit_batched(void);

/* [Max] number of I/O operations pending */
static atomic_t outstanding_io;
static int max_outstanding_io = 0;
/* Counts of bounce buffers allocated in start_one() and freed in cleanup;
 * suspend_check_io_stats() insists that they match. */
static atomic_t buffer_allocs, buffer_frees;

/* [Max] number of pages used for struct io_info storage */
static int infopages = 0;
static int maxinfopages = 0;

/* When set, forward_one_page() advances one additional page and then
 * clears the flag. */
static int extra_page_forward = 0;

/*
 * Readahead state.
 *
 * suspend_readahead_flags holds one "ready" bit per readahead slot: the
 * bit is cleared in start_one() when the read is queued and set in
 * suspend_bio_cleanup_one() once the read has been cleaned up.
 * suspend_readahead_pages holds the page backing each slot.
 * readahead_index is the next slot to be consumed and
 * readahead_submit_index the next slot to be submitted; both wrap at
 * MAX_OUTSTANDING_IO and are -1 until the first read of a stream.
 *
 * The lock is declared with DEFINE_SPINLOCK, like every other lock in
 * this file, rather than the deprecated SPIN_LOCK_UNLOCKED initializer.
 */
static volatile unsigned long suspend_readahead_flags[
	(MAX_OUTSTANDING_IO + BITS_PER_LONG - 1) / BITS_PER_LONG];
static DEFINE_SPINLOCK(suspend_readahead_flags_lock);
static struct page *suspend_readahead_pages[MAX_OUTSTANDING_IO];
static int readahead_index, readahead_submit_index;

/* Stream selected by the last suspend_rw_init() call. */
static int current_stream;
/* Saved extent positions, indexed by stream number in suspend_rw_init().
 * suspend_rw_cleanup() stores the position reached after writing stream 2
 * into slot 1. */
struct extent_iterate_saved_state suspend_writer_posn_save[3];

/* Pointer to current entry being loaded/saved. */
struct extent_iterate_state suspend_writer_posn;

/* Not static, so that the allocators can setup and complete
 * writing the header.
 *
 * suspend_writer_buffer is treated as one page; suspend_writer_buffer_posn
 * is the byte offset of the next header byte within it. */
char *suspend_writer_buffer;
int suspend_writer_buffer_posn;

/* File descriptor passed to sys_read() by suspend_rw_header_chunk() when
 * SUSPEND_TRY_RESUME_RD is set. */
int suspend_read_fd;

/*
 * Per-caller counters of do_bio_wait() invocations, and the labels used
 * when suspend_check_io_stats() prints them.
 *
 * The label at each position must match the index its caller passes to
 * do_bio_wait(): get_io_info_struct uses 0 and 1, suspend_finish_all_io 2,
 * wait_on_one_page 3, submit 4, start_one 5 and suspend_wait_on_readahead
 * 6. Slot 7 currently has no caller; it is kept so that both arrays retain
 * the eight entries the statistics loops iterate over.
 */
static unsigned long nr_schedule_calls[8];

static char *sch_caller[] = {
	"get_io_info_struct #1    ",
	"get_io_info_struct #2    ",
	"suspend_finish_all_io    ",
	"wait_on_one_page         ",
	"submit                   ",
	"start_one                ",
	"suspend_wait_on_readahead",
	"(unused)                 ",
};

/* Per-chain device information, installed via suspend_set_devinfo() and
 * indexed by suspend_writer_posn.current_chain. */
static struct suspend_bdev_info *suspend_devinfo;

/* Running total of header bytes copied by suspend_rw_header_chunk(). */
int suspend_header_bytes_used = 0;

/*
 * suspend_reset_io_stats
 *
 * Description:	Zero the high-water marks and the per-caller schedule
 * 		counters, so that the next run starts with clean statistics.
 */
static void suspend_reset_io_stats(void)
{
	int slot = sizeof(nr_schedule_calls) / sizeof(nr_schedule_calls[0]);

	maxinfopages = 0;
	max_outstanding_io = 0;

	/* Walk the counters from the last slot down to the first. */
	while (slot--)
		nr_schedule_calls[slot] = 0;
}

/*
 * suspend_check_io_stats
 *
 * Description:	Check that our statistics look right and print
 * 		any debugging info wanted.
 */
static void suspend_check_io_stats(void)
{
	int i;

	BUG_ON(atomic_read(&outstanding_io));
	BUG_ON(infopages);
	BUG_ON(!list_empty(&ioinfo_submit_batch));
	BUG_ON(!list_empty(&ioinfo_busy));
	BUG_ON(!list_empty(&ioinfo_ready_for_cleanup));
	BUG_ON(!list_empty(&ioinfo_free));
	BUG_ON(atomic_read(&buffer_allocs) != atomic_read(&buffer_frees));

	suspend_message(SUSPEND_WRITER, SUSPEND_LOW, 0,
			"Maximum outstanding_io was %d.\n",
			max_outstanding_io);
	suspend_message(SUSPEND_WRITER, SUSPEND_LOW, 0,
			"Max info pages was %d.\n",
			maxinfopages);
	if (atomic_read(&buffer_allocs) != atomic_read(&buffer_frees))
		suspend_message(SUSPEND_WRITER, SUSPEND_MEDIUM, 0,
			"Buffer allocs (%d) != buffer frees (%d)",
				atomic_read(&buffer_allocs),
				atomic_read(&buffer_frees));
	for(i = 0; i < 8; i++)
		suspend_message(SUSPEND_WRITER, SUSPEND_MEDIUM, 0,
			"Nr schedule calls %s: %lu.\n", sch_caller[i],
			nr_schedule_calls[i]);
}

/*
 * __suspend_bio_cleanup_one
 * 
 * Description: Clean up after completing I/O on a page.
 * Arguments:	struct io_info:	Data for I/O to be completed.
 */
static void __suspend_bio_cleanup_one(struct io_info *io_info)
{
	struct page *buffer_page;
	struct page *data_page;
	char *buffer_address, *data_address;
	int reading;

	buffer_page = io_info->buffer_page;
	data_page = io_info->data_page;

	reading = test_bit(IO_AWAITING_READ, &io_info->flags);
	suspend_message(SUSPEND_WRITER, SUSPEND_HIGH, 0,
		"Cleanup IO: [%p]\n", 
		io_info);

	if (reading && io_info->readahead_index == -1) {
		/*
		 * Copy the page we read into the buffer our caller provided.
		 */
		data_address = (char *) kmap(data_page);
		buffer_address = (char *) kmap(buffer_page);
		memcpy(data_address, buffer_address, PAGE_SIZE);
		kunmap(data_page);
		kunmap(buffer_page);
	
	}

	if (!reading || io_info->readahead_index == -1) {
		/* Sanity check */
		if (page_count(buffer_page) != 2)
			printk(KERN_EMERG "Cleanup IO: Page count on page %p"
					" is %d. Not good!\n",
					buffer_page, page_count(buffer_page));
		put_page(buffer_page);
		__free_page(buffer_page);
		atomic_inc(&buffer_frees);
	} else
		put_page(buffer_page);
	
	bio_put(io_info->sys_struct);
	io_info->sys_struct = NULL;
	io_info->flags = 0;
}

/*
 * suspend_bio_cleanup_one
 *
 * Description:	Clean up one completed I/O, mark its readahead slot (if
 * 		any) as ready, return the io_info to the free list and
 * 		finally drop the outstanding I/O count.
 * Arguments:	data:	The struct io_info to clean up.
 * Returns:	Always 0.
 */
static int suspend_bio_cleanup_one(void *data)
{
	struct io_info *info = data;
	unsigned long irq_flags;
	/* Taken before the cleanup helper runs. */
	int ra_slot = info->readahead_index;

	/* Make sure the struct is not linked on any list. */
	list_del_init(&info->list);

	__suspend_bio_cleanup_one(info);

	/* Tell readers that this readahead slot now holds valid data. */
	if (ra_slot >= 0) {
		spin_lock_irqsave(&suspend_readahead_flags_lock, irq_flags);
		set_bit(ra_slot % BITS_PER_LONG,
			&suspend_readahead_flags[ra_slot / BITS_PER_LONG]);
		spin_unlock_irqrestore(&suspend_readahead_flags_lock,
				irq_flags);
	}

	/* The struct is available for reuse from here on. */
	spin_lock_irqsave(&ioinfo_free_lock, irq_flags);
	list_add_tail(&info->list, &ioinfo_free);
	spin_unlock_irqrestore(&ioinfo_free_lock, irq_flags);

	/* Important: Must be last thing we do to avoid a race with
	 * finish_all_io when using keventd to do the cleanup */
	atomic_dec(&outstanding_io);

	return 0;
}

/* suspend_cleanup_some_completed_io
 *
 * Clean up entries from the ready_for_cleanup list, stopping once the list
 * is empty or submit_batch_size entries have been handled. The list lock
 * is dropped around each individual cleanup.
 *
 * NB: This is designed so that multiple callers can be in here simultaneously.
 */
static void suspend_cleanup_some_completed_io(void)
{
	struct io_info *head;
	unsigned long irq_flags;
	int cleaned = 0;
	int failed;

	spin_lock_irqsave(&ioinfo_ready_lock, irq_flags);
	for (;;) {
		if (list_empty(&ioinfo_ready_for_cleanup))
			break;

		head = list_entry(ioinfo_ready_for_cleanup.next,
				struct io_info, list);

		BUG_ON(!test_and_clear_bit(IO_AWAITING_CLEANUP, &head->flags));

		list_del_init(&head->list);

		/* The cleanup itself runs without the list lock held. */
		spin_unlock_irqrestore(&ioinfo_ready_lock, irq_flags);
		failed = suspend_bio_cleanup_one((void *) head);
		spin_lock_irqsave(&ioinfo_ready_lock, irq_flags);

		/* Only successful cleanups count towards the limit. */
		if (!failed && ++cleaned == submit_batch_size)
			break;
	}
	spin_unlock_irqrestore(&ioinfo_ready_lock, irq_flags);
}

/* do_bio_wait
 *
 * Actions taken when we want some I/O to get run.
 *
 * Count the call against the given caller, submit any I/O that is
 * batched up, flush kblockd, schedule, and then clean up whatever has
 * completed.
 */
static void do_bio_wait(int caller)
{
	nr_schedule_calls[caller]++;

	/* Don't want to wait on I/O we haven't submitted! */
	(void) submit_batched();

	kblockd_flush();

	io_schedule();

	suspend_cleanup_some_completed_io();
}

/*
 * suspend_finish_all_io
 *
 * Description:	Wait until no I/O is outstanding, then free every page
 * 		that was allocated to hold io_info structs. At that point
 * 		all io_info structs are back on the free list.
 */
static void suspend_finish_all_io(void)
{
	struct io_info *this, *next = NULL;
	unsigned long flags;

	/* Wait for all I/O to complete. */
	while (atomic_read(&outstanding_io))
		do_bio_wait(2);

	spin_lock_irqsave(&ioinfo_free_lock, flags);

	/*
	 * Two stages, to avoid using freed pages.
	 *
	 * First unlink every io_info that does not sit at the start of its
	 * page. get_io_info_struct() places the first struct of each page
	 * at the page's base address, so exactly one (page aligned) entry
	 * per page remains on the list afterwards.
	 */
	list_for_each_entry_safe(this, next, &ioinfo_free, list) {
		if (((unsigned long) this) & ~PAGE_MASK)
			list_del(&this->list);
	}

	/*
	 * Now we have only one reference to each page, and can safely
	 * free pages, knowing we're not going to be trying to access the
	 * same page after freeing it.
	 */
	list_for_each_entry_safe(this, next, &ioinfo_free, list) {
		/* Record the address first: it is only logged after the
		 * page has been freed, and the message uses %lx, which
		 * takes an unsigned long rather than a pointer. */
		unsigned long page_addr = (unsigned long) this;

		list_del(&this->list);
		free_page(page_addr);
		infopages--;
		suspend_message(SUSPEND_MEMORY, SUSPEND_VERBOSE, 0,
				"[FreedIOPage %lx]", page_addr);
	}

	spin_unlock_irqrestore(&ioinfo_free_lock, flags);
}

/*
 * wait_on_one_page
 *
 * Description:	Wait for a particular I/O to complete. We always wait at
 * 		least once, and keep waiting until all of the I/O's flag
 * 		bits have been cleared.
 */
static void wait_on_one_page(struct io_info *io_info)
{
	for (;;) {
		do_bio_wait(3);
		if (!io_info->flags)
			break;
	}
}

/*
 * suspend_wait_on_readahead
 *
 * Keep driving I/O until the ready bit of the given readahead slot is set.
 */
static void suspend_wait_on_readahead(int readahead_index)
{
	int word = readahead_index / BITS_PER_LONG;
	int bit = readahead_index % BITS_PER_LONG;

	for (;;) {
		/* Done as soon as the slot we want has been marked ready. */
		if (test_bit(bit, &suspend_readahead_flags[word]))
			return;
		do_bio_wait(6);
	}
}

/*
 * suspend_readahead_ready
 *
 * Returns non-zero if the ready bit of the given readahead slot is set,
 * zero otherwise. Does not wait.
 */
static int suspend_readahead_ready(int readahead_index)
{
	int word = readahead_index / BITS_PER_LONG;

	return test_bit(readahead_index % BITS_PER_LONG,
			&suspend_readahead_flags[word]);
}

/* suspend_prepare_readahead
 *
 * Allocate a zeroed page (GFP_ATOMIC) for the given readahead slot.
 * Returns 0 on success or -ENOMEM if no page could be obtained, in which
 * case the slot is left untouched. */
static int suspend_prepare_readahead(int index)
{
	unsigned long addr = get_zeroed_page(GFP_ATOMIC);

	if (addr) {
		suspend_readahead_pages[index] = virt_to_page(addr);
		return 0;
	}

	return -ENOMEM;
}

/* suspend_readahead_cleanup
 * Clean up structures used for readahead */
static void suspend_cleanup_readahead(int page)
{
	__free_page(suspend_readahead_pages[page]);
	suspend_readahead_pages[page] = 0;
	return;
}

/*
 * suspend_end_bio
 *
 * Description:	Function called by block driver from interrupt context when I/O
 * 		is completed. This is the reason we use spinlocks in
 * 		manipulating the io_info lists.
 * 		Nearly the fs/buffer.c version, but we want to mark the page as
 * 		done in our own structures too.
 * Arguments:	bio:	The bio whose bi_private holds our struct io_info.
 * 		num:	Unused here.
 * 		err:	Unused here; I/O errors are not recorded by this
 * 			function.
 * Returns:	1 if the bio has only partially completed, 0 once the
 * 		io_info has been queued for cleanup.
 */
static int suspend_end_bio(struct bio *bio, unsigned int num, int err)
{
	struct io_info *io_info = bio->bi_private;
	unsigned long flags;

	/*
	 * With this bi_end_io prototype the block layer may invoke us for
	 * partial completions; bi_size only reaches zero when the whole bio
	 * is done. Queueing the io_info for cleanup before then would let
	 * the buffer be freed while I/O is still in progress, and would put
	 * the struct on the cleanup list a second time later on.
	 */
	if (bio->bi_size)
		return 1;

	/* No longer busy... */
	spin_lock_irqsave(&ioinfo_busy_lock, flags);
	list_del_init(&io_info->list);
	spin_unlock_irqrestore(&ioinfo_busy_lock, flags);

	set_bit(IO_AWAITING_CLEANUP, &io_info->flags);

	/* ...and ready for process context to clean up. */
	spin_lock_irqsave(&ioinfo_ready_lock, flags);
	list_add_tail(&io_info->list, &ioinfo_ready_for_cleanup);
	spin_unlock_irqrestore(&ioinfo_ready_lock, flags);
	return 0;
}

/**
 *	submit - submit BIO request.
 *	@rw:	READ or WRITE.
 *	@io_info: IO info structure.
 *
 * 	Based on Patrick's pmdisk code from long ago:
 *	"Straight from the textbook - allocate and initialize the bio.
 *	If we're writing, make sure the page is marked as dirty.
 *	Then submit it and carry on."
 *
 *	With a twist, though - we handle block_size != PAGE_SIZE.
 *	Caller has already checked that our page is not fragmented.
 *
 *	Returns 0 once the bio has been handed to submit_bio(), or -EFAULT
 *	if the buffer page could not be added to the bio.
 */
static int submit(int rw, struct io_info *io_info)
{
	struct bio *bio;
	unsigned long irq_flags;

	/* Allocation is atomic; on failure let some I/O run and retry. */
	for (;;) {
		bio = bio_alloc(GFP_ATOMIC,1);
		if (bio)
			break;
		do_bio_wait(4);
	}

	bio->bi_bdev = io_info->dev;
	bio->bi_sector = io_info->block[0];
	bio->bi_private = io_info;
	bio->bi_end_io = suspend_end_bio;
	bio->bi_flags |= (1 << BIO_SUSPEND_DATA);
	io_info->sys_struct = bio;

	if (bio_add_page(bio, io_info->buffer_page, PAGE_SIZE, 0) < PAGE_SIZE) {
		printk("ERROR: adding page to bio at %lld\n",
				(unsigned long long) io_info->block[0]);
		bio_put(bio);
		return -EFAULT;
	}

	if (rw == WRITE)
		bio_set_pages_dirty(bio);

	/* Must be on the busy list before completion can possibly run. */
	spin_lock_irqsave(&ioinfo_busy_lock, irq_flags);
	list_add_tail(&io_info->list, &ioinfo_busy);
	spin_unlock_irqrestore(&ioinfo_busy_lock, irq_flags);

	submit_bio(rw,bio);

	return 0;
}

/*
 * submit_batched
 *
 * Submit a batch: take up to submit_batch_size entries off the
 * submit_batch list and pass each to submit().
 *
 * The submit function can wait on I/O (via do_bio_wait(), which calls
 * back into this function), so we have simple locking to avoid
 * recursion: a nested call returns immediately and leaves the remaining
 * entries to the invocation that is already running.
 *
 * Returns the number of entries submitted by this call (0 for a nested
 * call).
 */
static int submit_batched(void)
{
	static int running_already = 0;
	struct io_info *first;
	unsigned long flags;
	int num_submitted = 0;

	/* Already submitting further up the call chain? Don't nest. */
	if (running_already)
		return 0;

	running_already = 1;
	spin_lock_irqsave(&ioinfo_submit_lock, flags);
	while(!list_empty(&ioinfo_submit_batch)) {
		first = list_entry(ioinfo_submit_batch.next, struct io_info,
									list);

		BUG_ON(!test_and_clear_bit(IO_AWAITING_SUBMIT, &first->flags));

		list_del_init(&first->list);

		atomic_dec(&submit_batch);

		/* submit() may sleep, so the list lock is dropped around it. */
		spin_unlock_irqrestore(&ioinfo_submit_lock, flags);

		if (test_bit(IO_AWAITING_READ, &first->flags))
			submit(READ, first);
		else
			submit(WRITE, first);

		spin_lock_irqsave(&ioinfo_submit_lock, flags);

		num_submitted++;
		if (num_submitted == submit_batch_size)
			break;
	}
	spin_unlock_irqrestore(&ioinfo_submit_lock, flags);
	running_already = 0;

	return num_submitted;
}

/*
 * add_to_batch
 *
 * Queue a prepared io_info on the submit_batch list, and submit the batch
 * once it has grown to submit_batch_size entries.
 */
static void add_to_batch(struct io_info *io_info)
{
	unsigned long irq_flags;

	set_bit(IO_AWAITING_SUBMIT, &io_info->flags);

	/* Put our prepared I/O struct on the batch list. */
	spin_lock_irqsave(&ioinfo_submit_lock, irq_flags);
	list_add_tail(&io_info->list, &ioinfo_submit_batch);
	spin_unlock_irqrestore(&ioinfo_submit_lock, irq_flags);

	atomic_inc(&submit_batch);

	/* Not a full batch yet? Then there is nothing more to do. */
	if (atomic_read(&submit_batch) < submit_batch_size)
		return;

	submit_batched();
}

/*
 * get_io_info_struct
 *
 * Description:	Get an I/O struct. Waits while MAX_OUTSTANDING_IO
 * 		operations are already outstanding, reuses a struct from the
 * 		free list when there is one, and otherwise allocates a fresh
 * 		zeroed page and splits it into io_info structs. Keeps
 * 		retrying until it succeeds.
 * Returns:	Pointer to the struct prepared for use (already removed from
 * 		the free list).
 */
static struct io_info *get_io_info_struct(void)
{
	unsigned long newpage = 0, flags;
	struct io_info *this = NULL;
	int remaining = 0;

	/*
	 * Both exits from this loop (the two 'break's) leave with
	 * ioinfo_free_lock held; it is released after the struct has been
	 * taken off the free list below.
	 */
	do {
		/* Throttle: let some I/O complete before starting more. */
		while (atomic_read(&outstanding_io) >= MAX_OUTSTANDING_IO)
			do_bio_wait(0);

		/* Can start a new I/O. Is there a free one? */
		/* NOTE(review): the emptiness test is made before the lock is
		 * taken - confirm nothing else can drain the list in between. */
		if (!list_empty(&ioinfo_free)) {
			/* Yes. Grab it. */
			spin_lock_irqsave(&ioinfo_free_lock, flags);
			break;
		}

		/* No. Need to allocate a new page for I/O info structs. */
		newpage = get_zeroed_page(GFP_ATOMIC);
		if (!newpage) {
			/* Out of memory: let I/O progress, then try again. */
			do_bio_wait(1);
			continue;
		}

		suspend_message(SUSPEND_MEMORY, SUSPEND_VERBOSE, 0,
				"[NewIOPage %lx]", newpage);
		infopages++;
		if (infopages > maxinfopages)
			maxinfopages++;

		/* Prepare the new page for use. */
		/* The first struct sits at the page's base address, which is
		 * what suspend_finish_all_io() relies on when freeing pages. */
		this = (struct io_info *) newpage;
		remaining = PAGE_SIZE;
		spin_lock_irqsave(&ioinfo_free_lock, flags);
		while (remaining >= (sizeof(struct io_info))) {
			list_add_tail(&this->list, &ioinfo_free);
			this = (struct io_info *) (((char *) this) + 
					sizeof(struct io_info));
			remaining -= sizeof(struct io_info);
		}
		break;
	} while (1);

	/*
	 * We have an I/O info struct. Remove it from the free list.
	 * It will be added to the submit or busy list later.
	 */
	this = list_entry(ioinfo_free.next, struct io_info, list);
	list_del_init(&this->list);
	spin_unlock_irqrestore(&ioinfo_free_lock, flags);
	return this;
}

/*
 * start_one
 *
 * Description:	Prepare and start a read or write operation.
 * 		Note that we use our own buffer for reading or writing.
 * 		This simplifies doing readahead and asynchronous writing.
 * 		We can begin a read without knowing the location into which
 * 		the data will eventually be placed, and the buffer passed
 * 		for a write can be reused immediately (essential for the
 * 		modules system).
 * 		Failure? What's that?
 * Arguments:	rw:		READ or WRITE.
 * 		submit_info:	Page, device, start block and readahead
 * 				index for the operation.
 * Returns:	The io_info struct created. The I/O has either been put on
 * 		the batch list or submitted directly by the time we return.
 */
static struct io_info *start_one(int rw, struct submit_params *submit_info)
{
	struct io_info *io_info = get_io_info_struct();
	unsigned long buffer_virt = 0;
	char *to, *from;
	struct page *buffer_page;

	/* Defensive: get_io_info_struct() as written loops until it has a
	 * struct to return. */
	if (!io_info)
		return NULL;

	/* Get our local buffer */
	suspend_message(SUSPEND_WRITER, SUSPEND_HIGH, 1,
			"Start_IO: [%p]", io_info);
	
	/* Copy settings to the io_info struct */
	io_info->data_page = submit_info->page;
	io_info->readahead_index = submit_info->readahead_index;

	if (io_info->readahead_index == -1) {
		/* Ordinary I/O: allocate a private buffer page, retrying
		 * (and letting I/O run) until one is available. */
		while (!(buffer_virt = get_zeroed_page(GFP_ATOMIC)))
			do_bio_wait(5);

		atomic_inc(&buffer_allocs);
		suspend_message(SUSPEND_WRITER, SUSPEND_HIGH, 0,
				"[ALLOC BUFFER]->%d",
				real_nr_free_pages());
		buffer_page = virt_to_page(buffer_virt);
	
		io_info->buffer_page = buffer_page;
	} else {
		/* Readahead: the I/O goes directly into the caller's page.
		 * Mark the slot as not ready until cleanup sets the bit. */
		unsigned long flags;
		int index = io_info->readahead_index / BITS_PER_LONG;
		int bit = io_info->readahead_index - index * BITS_PER_LONG;

		spin_lock_irqsave(&suspend_readahead_flags_lock, flags);
		clear_bit(bit, &suspend_readahead_flags[index]);
		spin_unlock_irqrestore(&suspend_readahead_flags_lock, flags);

		io_info->buffer_page = buffer_page = submit_info->page;
	}

	/* If writing, copy our data. The data is probably in
	 * lowmem, but we cannot be certain. If there is no
	 * compression/encryption, we might be passed the
	 * actual source page's address. */
	/* NOTE(review): buffer_virt is only set in the non-readahead branch
	 * above; every write issued from this file uses readahead_index -1. */
	if (rw == WRITE) {
		to = (char *) buffer_virt;
		from = kmap_atomic(io_info->data_page, KM_USER1);
		memcpy(to, from, PAGE_SIZE);
		kunmap_atomic(from, KM_USER1);
	}

	/* Submit the page */
	/* Extra reference held for the duration of the I/O; it is dropped by
	 * put_page() in __suspend_bio_cleanup_one(). */
	get_page(buffer_page);
	
	io_info->dev = submit_info->dev;
	io_info->block[0] = submit_info->block[0];

	if (rw == READ)
		set_bit(IO_AWAITING_READ, &io_info->flags);

	suspend_message(SUSPEND_WRITER, SUSPEND_HIGH, 1,
			"-> (PRE BRW) %d\n",
			real_nr_free_pages());

	/* Batch the I/O unless batching is effectively disabled. */
	if (submit_batch_size > 1)
		add_to_batch(io_info);
	else
	 	submit(rw, io_info);
	
	/* Account for the I/O and track the high-water mark. Note that this
	 * happens after the I/O has been queued or submitted. */
	atomic_inc(&outstanding_io);
	if (atomic_read(&outstanding_io) > max_outstanding_io)
		max_outstanding_io++;
	
	return io_info;
}

static int suspend_do_io(int rw, 
		struct submit_params *submit_info, int syncio)
{
	struct io_info *io_info;

	if(!submit_info->dev) {
		printk("Suspend_do_io: submit_info->dev is NULL!\n");
		return 1;
	}
	
	io_info = start_one(rw, submit_info);

	if (!io_info) {
		printk("Unable to allocate an io_info struct.\n");
		return 1;
	} else if (syncio)
		wait_on_one_page(io_info);

	/* If we were the only one, clean everything up */
	if (!atomic_read(&outstanding_io))
		suspend_finish_all_io();
	return 0;
} 

/* We used to use bread here, but it doesn't correctly handle
 * blocksize != PAGE_SIZE. Now we create a submit_info to get the data we
 * want and use our normal routines (synchronously).
 */

static int suspend_bdev_page_io(int rw, struct block_device *bdev, long pos,
		struct page *page)
{
	struct submit_params submit_info;

	if (!bdev)
		return 0;

	submit_info.page = page;
	submit_info.dev = bdev;
	submit_info.block[0] = pos;
	submit_info.readahead_index = -1;
	return suspend_do_io(rw, &submit_info, 1);
}

/*
 * suspend_bio_memory_needed
 *
 * Returns the number of bytes this module asks to have available.
 */
static unsigned long suspend_bio_memory_needed(void)
{
	/* Cost of a single in-flight transaction: the data page plus the
	 * request, bio and io_info structures describing it. */
	unsigned long per_io = PAGE_SIZE + sizeof(struct request) +
				sizeof(struct bio) + sizeof(struct io_info);

	/* We want to have at least enough memory so as to have
	 * MAX_OUTSTANDING_IO transactions on the fly at once. If we
	 * can do more, fine. */
	return MAX_OUTSTANDING_IO * per_io;
}

/*
 * suspend_set_devinfo
 *
 * Install the per-chain device information table used when working out
 * which device and block each page goes to.
 */
static void suspend_set_devinfo(struct suspend_bdev_info *info)
{
	suspend_devinfo = info;
}

/*
 * forward_extra_blocks
 *
 * Advance the writer position by blocks_per_page - 1 further blocks for
 * the current chain. Returns -ENODATA if the extent state is at eof
 * afterwards, 0 otherwise.
 */
static int forward_extra_blocks(void)
{
	int skipped = 1;

	while (skipped < suspend_devinfo[suspend_writer_posn.current_chain].
							blocks_per_page) {
		suspend_extent_state_next(&suspend_writer_posn);
		skipped++;
	}

	if (!suspend_extent_state_eof(&suspend_writer_posn))
		return 0;

	printk("Extent state eof.\n");
	return -ENODATA;
}

/*
 * forward_one_page
 *
 * Advance the writer position by one page, and by one more page if
 * set_extra_page_forward() has been called (the request is consumed).
 * Returns 0 on success or -ENODATA if the extents run out.
 */
static int forward_one_page(void)
{
	for (;;) {
		int at_start = (suspend_writer_posn.current_chain == -1);

		/* Have to go forward one to ensure we're on the right chain,
		 * before we can know how many more blocks to skip.*/
		suspend_extent_state_next(&suspend_writer_posn);

		if (!at_start && forward_extra_blocks())
			return -ENODATA;

		if (!extra_page_forward)
			return 0;

		/* An extra page was requested: go around once more. */
		extra_page_forward = 0;
	}
}

/*
 * set_extra_page_forward
 *
 * Ask the next forward_one_page() call to advance one additional page.
 */
static void set_extra_page_forward(void)
{
	extra_page_forward = 1;
}

/*
 * suspend_rw_page
 *
 * Description:	Advance to the next page position in the extent data and
 * 		read or write one page there.
 * Arguments:	rw:		READ or WRITE.
 * 		page:		Page to read into or write from.
 * 		readahead_index: Readahead slot, or -1 for ordinary I/O.
 * 		sync:		Non-zero to wait for the I/O to complete.
 * 		debug:		Non-zero to print the device and block used.
 * Returns:	0 on success (or when SUSPEND_TEST_FILTER_SPEED is set, in
 * 		which case no I/O is done), -ENODATA if the position could
 * 		not be advanced, -EIO if the I/O could not be started.
 */
static int suspend_rw_page(int rw, struct page *page,
		int readahead_index, int sync, int debug)
{
	int i, current_chain;
	struct submit_params submit_params;

	if (test_action_state(SUSPEND_TEST_FILTER_SPEED))
		return 0;

	submit_params.readahead_index = readahead_index;
	submit_params.page = page;

	if (forward_one_page()) {
		printk("Failed to advance a page in the extent data.\n");
		return -ENODATA;
	}

	current_chain = suspend_writer_posn.current_chain;
	submit_params.dev = suspend_devinfo[current_chain].bdev;
	submit_params.block[0] = suspend_writer_posn.current_offset <<
		suspend_devinfo[current_chain].bmap_shift;

	/* suspend_do_io() copes with a NULL device (and reports it), so the
	 * debug output must not dereference the device before that check. */
	if (debug && submit_params.dev)
		printk("%s: %lx:%lx.\n", rw ? "Write" : "Read",
				(long) submit_params.dev->bd_dev,
				(long) submit_params.block[0]);

	i = suspend_do_io(rw, &submit_params, sync);

	if (i)
		return -EIO;

	return 0;
}

/*
 * suspend_bio_read_chunk
 *
 * Description:	Read the next page of the image into buffer_page.
 * 		For SUSPEND_ASYNC the read is simply queued against
 * 		buffer_page. Otherwise readahead is used: further reads
 * 		are submitted into the readahead slots until the slot we
 * 		need is ready (or the ring is full, or a submission fails),
 * 		then the data from the oldest slot is copied to buffer_page
 * 		and that slot is released.
 * Returns:	0 on success (or the result of suspend_rw_page() in the
 * 		async case); -EPERM if an earlier submission failed and all
 * 		previously submitted readahead has been consumed.
 */
static int suspend_bio_read_chunk(struct page *buffer_page, int sync)
{
	/* Remembered across calls: result of the last readahead submission. */
	static int last_result;
	unsigned long *virt;

	if (sync == SUSPEND_ASYNC)
		return suspend_rw_page(READ, buffer_page, -1, sync, 0);

	/* Start new readahead while we wait for our page */
	/* First read after suspend_rw_init(): reset the ring. */
	if (readahead_index == -1) {
		last_result = 0;
		readahead_index = readahead_submit_index = 0;
	}

	/* Start a new readahead? */
	if (last_result) {
		/* We failed to submit a read, and have cleaned up
		 * all the readahead previously submitted */
		if (readahead_submit_index == readahead_index)
			return -EPERM;
		/* Submitted pages remain: consume them, but submit no more. */
		goto wait;
	}
	
	do {
		/* No memory for another readahead page: stop submitting. */
		if (suspend_prepare_readahead(readahead_submit_index))
			break;

		last_result = suspend_rw_page(
			READ,
			suspend_readahead_pages[readahead_submit_index], 
			readahead_submit_index, SUSPEND_ASYNC, 0);
		if (last_result) {
			printk("Begin read chunk for page %d returned %d.\n",
				readahead_submit_index, last_result);
			suspend_cleanup_readahead(readahead_submit_index);
			break;
		}

		readahead_submit_index++;

		/* The slots form a ring of MAX_OUTSTANDING_IO entries. */
		if (readahead_submit_index == MAX_OUTSTANDING_IO)
			readahead_submit_index = 0;

	} while((!last_result) && (readahead_submit_index != readahead_index) &&
			(!suspend_readahead_ready(readahead_index)));

wait:
	/* NOTE(review): if nothing was submitted for readahead_index (e.g.
	 * the very first suspend_prepare_readahead() failed), this wait
	 * depends on a ready bit that no pending I/O will set - confirm
	 * that case cannot occur. */
	suspend_wait_on_readahead(readahead_index);

	virt = kmap_atomic(buffer_page, KM_USER1);
	memcpy(virt, page_address(suspend_readahead_pages[readahead_index]),
			PAGE_SIZE);
	kunmap_atomic(virt, KM_USER1);

	/* The slot's page has been consumed; free it and move on. */
	suspend_cleanup_readahead(readahead_index);

	readahead_index++;
	if (readahead_index == MAX_OUTSTANDING_IO)
		readahead_index = 0;

	return 0;
}

/*
 * suspend_rw_init
 *
 * Prepare to read or write the given stream: restore the saved extent
 * position for that stream, mark readahead as not yet started and reset
 * the I/O statistics. Always returns 0.
 */
static int suspend_rw_init(int rw, int stream_number)
{
	current_stream = stream_number;
	suspend_extent_state_restore(&suspend_writer_posn,
			&suspend_writer_posn_save[stream_number]);

	/* The restored position must refer to an extent. */
	BUG_ON(!suspend_writer_posn.current_extent);

	/* -1 means no readahead has been started for this stream yet. */
	readahead_submit_index = -1;
	readahead_index = -1;

	suspend_reset_io_stats();

	return 0;
}

/*
 * suspend_rw_cleanup
 *
 * Finish reading or writing a stream: wait for and clean up all I/O,
 * release any readahead pages that were submitted but never consumed,
 * and check the statistics. Always returns 0.
 */
static int suspend_rw_cleanup(int rw)
{
	/* After writing stream 2, remember where we got to as the starting
	 * position of stream 1. */
	if (current_stream == 2 && rw == WRITE)
		suspend_extent_state_save(&suspend_writer_posn,
				&suspend_writer_posn_save[1]);

	suspend_finish_all_io();

	if (rw == READ) {
		/* Free the readahead slots that are still populated. */
		while (readahead_index != readahead_submit_index) {
			suspend_cleanup_readahead(readahead_index);
			if (++readahead_index == MAX_OUTSTANDING_IO)
				readahead_index = 0;
		}
	}

	suspend_check_io_stats();

	return 0;
}

/*
 * suspend_write_chunk
 *
 * Queue an asynchronous, non-readahead write of one page at the next
 * position. Returns the result of suspend_rw_page().
 */
static int suspend_write_chunk(struct page *buffer_page)
{
	int result = suspend_rw_page(WRITE, buffer_page, -1, 0, 0);

	return result;
}

/*
 * suspend_rw_header_chunk
 *
 * Description:	Copy a chunk of header data to (write) or from (read) the
 * 		page-sized suspend_writer_buffer, doing page I/O each time
 * 		the end of the buffer is reached.
 * Arguments:	rw:		Non-zero to write buffer to the header, zero
 * 				to read from the header into buffer.
 * 		owner:		Module to charge the space to, or NULL. It is
 * 				a BUG() for a module to use more than its
 * 				header_requested.
 * 		buffer:		Caller's data.
 * 		buffer_size:	Number of bytes to transfer.
 * Returns:	0 after a write, buffer_size after a read, or -EIO if a
 * 		page could not be read or written.
 */
static int suspend_rw_header_chunk(int rw, struct suspend_module_ops *owner,
		char *buffer, int buffer_size)
{
	int bytes_left = buffer_size;
	
	if (owner) {
		owner->header_used += buffer_size;
		if (owner->header_used > owner->header_requested) {
			/* Adjacent literals are concatenated as-is, so the
			 * first one must end with a space. */
			printk(KERN_EMERG "Suspend2 module %s is using more "
				"header space (%lu) than it requested (%lu).\n",
				owner->name,
				owner->header_used,
				owner->header_requested);
			BUG();
		}
	}

	/* Transfer the chunk, one buffer page at a time */
	while (bytes_left) {
		char *source_start = buffer + buffer_size - bytes_left;
		char *dest_start = suspend_writer_buffer + suspend_writer_buffer_posn;
		int capacity = PAGE_SIZE - suspend_writer_buffer_posn;
		/* Writing copies caller -> page buffer; reading the reverse. */
		char *to = rw ? dest_start : source_start;
		char *from = rw ? source_start : dest_start;

		/* Everything that is left fits within the current page. */
		if (bytes_left <= capacity) {
			if (test_debug_state(SUSPEND_HEADER))
				printk("Copy %d bytes %d-%d from %p to %p.\n",
						bytes_left,
						suspend_header_bytes_used,
						suspend_header_bytes_used + bytes_left,
						from, to);
			memcpy(to, from, bytes_left);
			suspend_writer_buffer_posn += bytes_left;
			suspend_header_bytes_used += bytes_left;
			return rw ? 0 : buffer_size;
		}

		/* Use up the rest of this page, then move to the next page */
		if (test_debug_state(SUSPEND_HEADER))
			printk("Copy %d bytes (%d-%d) from %p to %p.\n",
					capacity,
					suspend_header_bytes_used,
					suspend_header_bytes_used + capacity,
					from, to);
		memcpy(to, from, capacity);
		bytes_left -= capacity;
		suspend_header_bytes_used += capacity;

		/* NOTE(review): the sys_read() path reads BLOCK_SIZE bytes and
		 * its return value is not checked - confirm this is intended. */
		if (rw == READ && test_suspend_state(SUSPEND_TRY_RESUME_RD))
			sys_read(suspend_read_fd,
				suspend_writer_buffer, BLOCK_SIZE);
		else {
			/* Reads are synchronous (!rw); writes are queued. */
			if (suspend_rw_page(rw,
					virt_to_page(suspend_writer_buffer),
					-1, !rw,
					test_debug_state(SUSPEND_HEADER)))
				return -EIO;
		}

		suspend_writer_buffer_posn = 0;
		suspend_cond_pause(0, NULL);
	}

	return rw ? 0 : buffer_size;
}

/*
 * write_header_chunk_finish
 *
 * Queue a write of the header buffer page in its current state.
 * Returns 0 on success, -EIO on any failure.
 */
static int write_header_chunk_finish(void)
{
	int failed = suspend_rw_page(WRITE,
			virt_to_page(suspend_writer_buffer),
			-1, 0, test_debug_state(SUSPEND_HEADER));

	if (failed)
		return -EIO;

	return 0;
}

struct suspend_bio_ops suspend_bio_ops = {
	.bdev_page_io = suspend_bdev_page_io,
	.check_io_stats = suspend_check_io_stats,
	.reset_io_stats = suspend_reset_io_stats,
	.finish_all_io = suspend_finish_all_io,
	.prepare_readahead = suspend_prepare_readahead,
	.cleanup_readahead = suspend_cleanup_readahead,
	.readahead_pages = suspend_readahead_pages,
	.readahead_ready = suspend_readahead_ready,
	.forward_one_page = forward_one_page,
	.set_extra_page_forward = set_extra_page_forward,
	.set_devinfo = suspend_set_devinfo,
	.read_chunk = suspend_bio_read_chunk,
	.write_chunk = suspend_write_chunk,
	.rw_init = suspend_rw_init,
	.rw_cleanup = suspend_rw_cleanup,
	.rw_header_chunk = suspend_rw_header_chunk,
	.write_header_chunk_finish = write_header_chunk_finish,
};

/*
 * Module descriptor registered with the suspend2 core. Besides the
 * identification fields it supplies only the memory_needed callback.
 */
static struct suspend_module_ops suspend_blockwriter_ops = {
	.module = THIS_MODULE,
	.type = MISC_MODULE,
	.name = "Block I/O",
	.memory_needed = suspend_bio_memory_needed,
};

/*
 * suspend_block_io_load
 *
 * Register the block I/O module descriptor with the suspend2 core.
 * Returns the result of the registration.
 */
static __init int suspend_block_io_load(void)
{
	int result = suspend_register_module(&suspend_blockwriter_ops);

	return result;
}

/*
 * When built as a module, register on load and unregister on unload.
 * When built in, there is no unload path and registration is done from a
 * late initcall instead.
 */
#ifdef MODULE
/* Remove the block I/O module descriptor from the suspend2 core. */
static __exit void suspend_block_io_unload(void)
{
	suspend_unregister_module(&suspend_blockwriter_ops);
}

module_init(suspend_block_io_load);
module_exit(suspend_block_io_unload);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Nigel Cunningham");
MODULE_DESCRIPTION("Suspend2 block io functions");
#else
late_initcall(suspend_block_io_load);
#endif
