程式人生 > 從海量資料中找中位數(C語言實現)

從海量資料中找中位數(c語言實現)

題目:5億個int,從中找出第k大的數(下面的程式是由小到大計數,也就是求第k小的數;k取總個數的一半時,結果即為中位數)

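演算法:分桶 + 快速選擇。先把32位元int的值域平均切成1024個桶(每個桶涵蓋2^22個連續的值),一邊讀入資料,一邊把每個數寫到對應桶的檔案foo_N.dat,同時統計每個桶的個數;接著由小到大累加各桶的個數,找出第k個數落在哪個桶以及它在桶內的名次;最後只把那一個桶的檔案讀入記憶體,用quick select找出答案。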

實現:

#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>

/* Per-bucket state: a staging buffer for values headed to the bucket's file,
 * plus a running total of how many values the bucket has received.
 */
typedef struct bucket_t {
	int *buf;		/* output buffer: values waiting to be written to the bucket file */
	int count;		/* total number of values routed to this bucket so far */
	int idx;		/* next free slot in buf, i.e. how many values buf currently holds */
} bucket_t;

/* Run-time configuration; every value below is assigned in main(). */
static unsigned int BUF_PAGES;		/* number of pages in the input buffer */
static unsigned int PAGE_SIZE;		/* size of one page, in bytes */
static unsigned int BUF_SIZE;		/* input buffer size in bytes, BUF_SIZE = BUF_PAGES*PAGE_SIZE */
static unsigned int nbuckets;		/* number of buckets the values are split into */
static unsigned int BUCKET_BUF_SIZE;	/* size in bytes of each bucket's output buffer */

static int *buffer;					/* input buffer */

/* Current wall-clock time in microseconds. */
long get_time_usecs();
/* Append the values buffered in bucket to the file foo_<pos>.dat. */
void write_to_file(bucket_t *bucket, int pos);
/* Partition a[s..t] around a[t]; returns the pivot's final index. */
int partition(int *a, int s, int t);
/* Return the i-th smallest (1-based) element of a[s..t]. */
int quick_select(int *a, int s, int t, int i);
/* Exchange the two ints pointed to by p and q. */
void swap(int *p, int *q);

/*
 * Find the k-th smallest int stored in a raw binary file.
 *
 * Usage: a.out <file> <k>
 *   <file>  sequence of native-endian ints
 *   <k>     1-based rank, counted from the smallest value
 *
 * Pass 1 streams the input and distributes every value into one of
 * nbuckets files (foo_0.dat ... foo_<nbuckets-1>.dat) by value range,
 * counting how many values land in each bucket.  Pass 2 walks the counts
 * in ascending bucket order to find the bucket that contains rank k,
 * loads only that bucket into memory and runs quick_select() on it.
 *
 * NOTE: the messages printed below say "第k大", but the rank is counted in
 * ascending order (bucket 0 holds the smallest values).
 *
 * Returns 0 on success and EXIT_FAILURE on any error.  Error paths rely on
 * process exit to release memory and descriptors.
 */
int main(int argc, char **argv)
{
	char			filename[20];
	unsigned int	k, length, bp;
	int				fd, i;
	ssize_t			bytes;
	bucket_t		*bucket;

	long start_usecs = get_time_usecs();

	if (argc < 3) {
		fprintf(stderr, "usage: %s <file> <k>\n", argc > 0 ? argv[0] : "a.out");
		return EXIT_FAILURE;
	}

	/* strtoul instead of atoi so that garbage, 0 and overflow are rejected */
	char			*end;
	unsigned long	parsed = strtoul(argv[2], &end, 10);
	if (end == argv[2] || *end != '\0' || parsed == 0
			|| parsed != (unsigned long)(unsigned int)parsed) {
		fprintf(stderr, "invalid k: %s\n", argv[2]);
		return EXIT_FAILURE;
	}
	k = (unsigned int)parsed;

	/* open argv[1] directly: copying it into a small fixed buffer could overflow */
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		fprintf(stderr, "can't open file %s\n", argv[1]);
		return EXIT_FAILURE;
	}

	nbuckets = 1024;
	PAGE_SIZE = 4096;							/* page = 4KB */
	BUF_PAGES = 1024;
	BUF_SIZE = PAGE_SIZE*BUF_PAGES;				/* 4KB * 1024 = 4M */
	BUCKET_BUF_SIZE = PAGE_SIZE*128;			/* 4KB * 128 = 512KB */

	buffer = malloc(BUF_SIZE);
	if (buffer == NULL) {
		fprintf(stderr, "out of memory\n");
		return EXIT_FAILURE;
	}

	/* Split the 2^32 possible values into nbuckets groups;
	 * nbuckets must be a power of two. */
	bucket = malloc(sizeof(*bucket) * nbuckets);
	if (bucket == NULL) {
		fprintf(stderr, "out of memory\n");
		return EXIT_FAILURE;
	}
	for (i = 0; i < (int)nbuckets; i++) {
		/* write_to_file() appends, so a file left over from an earlier
		 * run would pollute this run's bucket; a missing file is fine. */
		snprintf(filename, sizeof(filename), "foo_%d.dat", i);
		(void)unlink(filename);

		bucket[i].buf = malloc(BUCKET_BUF_SIZE);
		if (bucket[i].buf == NULL) {
			fprintf(stderr, "out of memory\n");
			return EXIT_FAILURE;
		}
		bucket[i].idx = 0;
		bucket[i].count = 0;
	}

	/* Adding 2^31 in unsigned arithmetic maps INT_MIN..INT_MAX onto
	 * 0..2^32-1 while keeping the order; the top 10 bits of the result are
	 * the bucket index (1024 buckets of 2^22 values each).  Assumes a
	 * 32-bit int. */
	const unsigned int	sign_bias = 2147483648u;
	const unsigned int	bucket_shift = 22;
	size_t				carry = 0;	/* bytes of an incomplete int left from the previous read */

	for (;;) {
		bytes = read(fd, (char *)buffer + carry, BUF_SIZE - carry);
		if (bytes < 0) {
			fprintf(stderr, "read error on %s\n", argv[1]);
			return EXIT_FAILURE;
		}
		if (bytes == 0) {
			break;		/* EOF; any trailing partial int is ignored */
		}

		size_t avail = carry + (size_t)bytes;
		length = (unsigned int)(avail / sizeof(int));

		/* take each value from the input buffer and add it to its bucket */
		for (bp = 0; bp < length; bp++) {
			int				element = buffer[bp];
			unsigned int	pos = ((unsigned int)element + sign_bias) >> bucket_shift;
			bucket_t		*p = &bucket[pos];

			p->buf[p->idx++] = element;
			p->count++;
			/* bucket buffer is full: flush it to the bucket file */
			if ((size_t)p->idx * sizeof(int) == BUCKET_BUF_SIZE) {
				write_to_file(p, (int)pos);
				p->idx = 0;
			}
		}

		/* keep the bytes of a value split across two reads */
		carry = avail % sizeof(int);
		if (carry > 0) {
			memmove(buffer, (char *)buffer + (size_t)length * sizeof(int), carry);
		}
	}

	/* write what is left in every bucket buffer to its file */
	for (i = 0; i < (int)nbuckets; i++) {
		write_to_file(bucket+i, i);
	}

	free(buffer);
	buffer = NULL;
	close(fd);

	/* find the bucket (file) that contains rank k */
	unsigned int sum = 0;
	for (i = 0; i < (int)nbuckets; i++) {
		sum += (unsigned int)bucket[i].count;
		if (sum >= k) {
			break;
		}
	}
	if (i == (int)nbuckets) {
		fprintf(stderr, "k = %u exceeds the number of integers read (%u)\n", k, sum);
		return EXIT_FAILURE;
	}

	/* 1-based rank of the wanted value inside bucket i */
	int rank = (int)(k - (sum - (unsigned int)bucket[i].count));

	/* Size the buffer from the bucket's real count: with duplicate values
	 * a bucket can hold more than 2^22 numbers. */
	size_t want = (size_t)bucket[i].count * sizeof(int);
	buffer = malloc(want);
	if (buffer == NULL) {
		fprintf(stderr, "out of memory\n");
		return EXIT_FAILURE;
	}

	/* load that bucket file into memory */
	snprintf(filename, sizeof(filename), "foo_%d.dat", i);
	printf("第%d大的數位於檔案%s的第%d大的數\n", (int)k, filename, rank);
	fd = open(filename, O_RDONLY);
	if (fd < 0) {
		fprintf(stderr, "can't open file %s\n", filename);
		free(buffer);
		return EXIT_FAILURE;
	}

	size_t got = 0;
	while (got < want) {		/* read() may return less than requested */
		bytes = read(fd, (char *)buffer + got, want - got);
		if (bytes <= 0) {
			break;
		}
		got += (size_t)bytes;
	}
	close(fd);
	if (got != want) {
		fprintf(stderr, "short read from %s\n", filename);
		free(buffer);
		return EXIT_FAILURE;
	}
	length = (unsigned int)bucket[i].count;

	/* Select the rank-th smallest value of the bucket.  The range starts
	 * at index 0 so that the first element takes part in the selection. */
	int answer = quick_select(buffer, 0, (int)length - 1, rank);
	printf("第%d大的數 = %d\n", (int)k, answer);

	free(buffer);
	buffer = NULL;

	/* free buckets */
	for (i = 0; i < (int)nbuckets; i++) {
		free(bucket[i].buf);
	}
	free(bucket);

	long end_usecs = get_time_usecs();
	double secs = (double)(end_usecs - start_usecs) / (double)1000000;
	printf("it took %.02f seconds.\n", secs);

	return 0;
}

/*
 * Append the values currently buffered in a bucket to its file.
 *
 * bucket: bucket whose first bucket->idx ints (from bucket->buf) are written
 * pos:    bucket number; the target file is "foo_<pos>.dat"
 *
 * The file is created if it does not exist, also when bucket->idx is 0.
 * bucket->idx is not reset here; that is left to the caller.  Any open,
 * write or close failure prints a message to stderr and terminates the
 * process with EXIT_FAILURE.
 */
void write_to_file(bucket_t *bucket, int pos)
{
	char		filename[20];
	const char	*src = (const char *)bucket->buf;
	size_t		remaining = (size_t)bucket->idx * sizeof(int);
	int			fd;

	snprintf(filename, sizeof(filename), "foo_%d.dat", pos);
	fd = open(filename, O_WRONLY | O_CREAT | O_APPEND, 0666);
	if (fd < 0) {
		fprintf(stderr, "can't open file %s\n", filename);
		exit(EXIT_FAILURE);
	}

	/* write() may transfer fewer bytes than asked; keep going until the
	 * whole buffer is out or a real error occurs */
	while (remaining > 0) {
		ssize_t n = write(fd, src, remaining);
		if (n <= 0) {
			fprintf(stderr, "idx = %d, bytes = %d, write error\n", bucket->idx, (int)n);
			close(fd);
			exit(EXIT_FAILURE);
		}
		src += n;
		remaining -= (size_t)n;
	}

	if (close(fd) < 0) {
		fprintf(stderr, "can't close file %s\n", filename);
		exit(EXIT_FAILURE);
	}
}

/*
 * Return the current wall-clock time in microseconds since the Epoch,
 * or -1 if gettimeofday() fails.
 *
 * The timezone argument of gettimeofday() is obsolete, so NULL is passed.
 */
long get_time_usecs(void)
{
	struct timeval	now;

	if (gettimeofday(&now, NULL) != 0) {
		return -1;
	}

	return (long)now.tv_sec * 1000000L + (long)now.tv_usec;
}

/* Exchange the two ints that p and q point to; p and q may be the same address. */
void swap(int *p, int *q)
{
	const int	held = *q;

	*q = *p;
	*p = held;
}

/* Partition a[s..t] around the pivot a[t]: elements strictly less than the
 * pivot are moved to the front, then the pivot is placed right after them.
 * Returns the pivot's final index; everything before it is less than the
 * pivot and everything after it is greater than or equal to it.
 */
int partition(int *a, int s, int t)
{
	const int	pivot = a[t];
	int			boundary = s;	/* first slot not yet known to be < pivot */
	int			scan;

	for (scan = s; scan < t; scan++) {
		if (a[scan] >= pivot) {
			continue;
		}
		swap(&a[scan], &a[boundary]);
		boundary++;
	}
	swap(&a[boundary], &a[t]);

	return boundary;
}

/*
 * Return the i-th smallest element of a[s..t] (i is 1-based).
 *
 * a: array to search; its elements are reordered by partition()
 * s: first index of the range
 * t: last index of the range (inclusive); requires s <= t
 * i: wanted rank in 1..(t-s+1); out-of-range values are clamped to the
 *    nearest valid rank so the search never leaves a[s..t]
 *
 * Written as a loop rather than recursion: with sorted or duplicate-heavy
 * input every step shrinks the range by only one element, and a recursive
 * version would then need one stack frame per element.  The running time
 * in that worst case is still quadratic.
 */
int quick_select(int *a, int s, int t, int i)
{
	int		p, m;

	if (i < 1) {
		i = 1;
	} else if (i > t - s + 1) {
		i = t - s + 1;
	}

	while (s < t) {
		p = partition(a, s, t);
		m = p - s + 1;			/* rank of the pivot inside a[s..t] */
		if (m == i) {
			return a[p];
		}
		if (m > i) {
			t = p - 1;			/* answer lies left of the pivot */
		} else {
			s = p + 1;			/* answer lies right of the pivot */
			i -= m;
		}
	}

	return a[s];
}
 

執行和測試:

尋找第1111大的整數

dd if=/dev/urandom of=random.dat bs=1M count=1024

gcc main.c

./a.out random.dat 1111