Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

External Sort Algorithm #29

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
220 changes: 220 additions & 0 deletions src/include/utils/externalSort.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <vector>

namespace buzzdb {
namespace utils {

// #define UNUSED(p) ((void)(p))

// void hello_world(){
// printf("Hello world");
// }

/// One entry stored in a MinHeap.
struct MinHeapNode {
// Value the heap is ordered by (MinHeapify compares this field).
int elem;
// Extra integer carried along with elem.
// NOTE(review): presumably the index of the run file elem was read from — confirm against mergeData.
int x;
};

// Forward declaration: exchanges the two nodes pointed to by a and b (defined after MinHeap, which calls it).
void swap(MinHeapNode* a, MinHeapNode *b);

/// Binary min-heap laid out over an array of MinHeapNode supplied by the caller.
/// Nodes are ordered by MinHeapNode::elem; the heap does not allocate or free
/// the array it is given.
class MinHeap {
    MinHeapNode* node;  // first element of the array the heap is built over
    int sizeHeap;       // number of array entries that belong to the heap

public:
    /// Turns the first `size` entries of `x` into a min-heap, in place.
    MinHeap(MinHeapNode x[], int size);

    /// Restores the heap property for the subtree rooted at the given index.
    void MinHeapify(int);

    /// Array index of the left child of the node at index x.
    int left(int x) { return x * 2 + 1; }

    /// Array index of the right child of the node at index x.
    int right(int x) { return x * 2 + 2; }

    /// Returns a copy of the root, i.e. the node with the smallest elem.
    MinHeapNode getMinHeap() { return *node; }

    /// Overwrites the root with x and sifts it down to its proper position.
    void replaceMinNode(MinHeapNode x) {
        *node = x;
        MinHeapify(0);
    }
};

/// Builds a min-heap in place over the first `size` entries of `x`.
/// The heap keeps a pointer to `x`; it does not copy the array.
MinHeap::MinHeap(MinHeapNode x[], int size) : node(x), sizeHeap(size) {
    // Sift down every index from the middle of the array back to the root.
    int index = (sizeHeap - 1) / 2;
    while (index >= 0) {
        MinHeapify(index);
        --index;
    }
}

/// Sifts the node at index x down until neither child has a smaller elem.
void MinHeap::MinHeapify(int x) {
    for (;;) {
        const int leftChild = left(x);
        const int rightChild = right(x);

        // Pick the smallest of the node and its (in-range) children.
        int smallest = x;
        if (leftChild < sizeHeap && node[leftChild].elem < node[x].elem) {
            smallest = leftChild;
        }
        if (rightChild < sizeHeap && node[rightChild].elem < node[smallest].elem) {
            smallest = rightChild;
        }

        if (smallest == x) {
            return;  // heap property holds here; nothing further to move
        }
        swap(&node[x], &node[smallest]);
        x = smallest;  // continue with the child that received the larger node
    }
}

/// Exchanges the contents of the two nodes pointed to by a and b.
void swap(MinHeapNode* a, MinHeapNode* b) {
    const MinHeapNode held = *b;
    *b = *a;
    *a = held;
}

/// Merges two adjacent sorted ranges of arr into one sorted range, in place.
///
/// @param arr   array holding both ranges
/// @param left  index of the first element of the lower range
/// @param mid   index of the last element of the lower range
/// @param right index of the last element of the upper range (inclusive)
///
/// The lower range is arr[left..mid] and the upper range is arr[mid+1..right].
/// When two elements compare equal, the one from the lower range is placed first.
void merge(int arr[], int left, int mid, int right) {
    // Copy both halves out first: the merged result is written back over
    // arr[left..right], which would otherwise clobber unread input.
    const std::vector<int> lower(arr + left, arr + mid + 1);
    const std::vector<int> upper(arr + mid + 1, arr + right + 1);

    std::merge(lower.begin(), lower.end(), upper.begin(), upper.end(), arr + left);
}

/// Sorts arr[left..right] (both bounds inclusive) in ascending order using
/// top-down merge sort. Ranges with fewer than two elements are left untouched.
void mergeSort(int arr[], int left, int right) {
    if (left >= right) {
        return;
    }

    // Written as left + half-width rather than (left + right) / 2.
    const int mid = left + (right - left) / 2;
    mergeSort(arr, left, mid);
    mergeSort(arr, mid + 1, right);
    merge(arr, left, mid, right);
}

/// Opens a file with fopen and terminates the process if that fails.
///
/// @param fileName path passed to fopen
/// @param mode     fopen mode string (e.g. "r" or "w")
/// @return the opened stream; never null, because failure exits the process
///
/// Both parameters are pointers to const so that string literals and
/// const char* values can be passed; fopen does not modify either string.
FILE* openFile(const char* fileName, const char* mode)
{
    FILE* file = fopen(fileName, mode);
    if (file == nullptr) {
        perror("Error: Cannot open file");
        exit(EXIT_FAILURE);
    }
    return file;
}
void mergeData(char* openedFile, int n, int k) {

FILE* in[k];
int x = 0;
while ( x < k) {
char fileName[2];
snprintf(fileName, sizeof(fileName), "%d", x);
in[x] = openFile(fileName, "r");
x++;
}
FILE* out = openFile(openedFile, "w");
MinHeapNode* node = new MinHeapNode[k];
int y = 0;
while (y < k) {
if (fscanf(in[y], "%d ", &node[y].elem) != 1)
break;
node[y].y = y;
y++;
}
MinHeap hp(node, y);
int count = 0;
while (count != y) {
MinHeapNode root = hp.getMin();
fprintf(out, "%d ", root.elem);
if (fscanf(in[root.i], "%d ", &root.elem) != 1) {
root.elem = INT_MAX;
count++;
}
hp.replaceMin(root);
}
int z = 0;
while (z < k) {
fclose(in[z]);
}
fclose(out);
}
/// Splits the input file into sorted run files.
///
/// @param inputFile path of a file of whitespace-separated integers
/// @param memory    maximum number of integers held and sorted at once
/// @param num_ways  number of run files to create
///
/// Creates (or truncates) files named "0", "1", ..., "num_ways-1". The input
/// is read in chunks of at most `memory` integers; each chunk is sorted with
/// mergeSort and written to the next run file. Runs that receive no data are
/// left empty. The process is terminated with an error message if `memory` or
/// `num_ways` is not positive, or if the input holds more than
/// memory * num_ways integers (there would be no run file left to write to).
void initialiseData( char* inputFile, int memory, int num_ways) {
    if (memory <= 0 || num_ways <= 0) {
        fprintf(stderr, "Error: memory and num_ways must be positive\n");
        exit(EXIT_FAILURE);
    }

    // Mode strings live in writable arrays so they can be passed to openFile
    // whether its parameters are declared char* or const char*.
    char readMode[] = "r";
    char writeMode[] = "w";

    FILE* in = openFile(inputFile, readMode);

    std::vector<FILE*> out(num_ways);
    for (int run = 0; run < num_ways; ++run) {
        // Large enough for any int rendered in decimal plus the terminator.
        char fileName[16];
        snprintf(fileName, sizeof(fileName), "%d", run);
        out[run] = openFile(fileName, writeMode);
    }

    std::vector<int> arr(memory);
    bool inputing = true;
    int nextOutput = 0;

    while (inputing) {
        // Fill the buffer until it is full or the input runs out.
        int count = 0;
        while (count < memory) {
            if (fscanf(in, "%d ", &arr[count]) != 1) {
                inputing = false;
                break;
            }
            count++;
        }
        if (count == 0) {
            break;  // nothing left to write
        }
        if (nextOutput >= num_ways) {
            fprintf(stderr, "Error: input does not fit in %d runs of %d values\n",
                    num_ways, memory);
            exit(EXIT_FAILURE);
        }

        mergeSort(arr.data(), 0, count - 1);
        for (int i = 0; i < count; ++i) {
            fprintf(out[nextOutput], "%d ", arr[i]);
        }
        nextOutput++;
    }

    for (FILE* run : out) {
        fclose(run);
    }
    fclose(in);
}
/// Sorts the integers stored in inputFile and writes them to outputFile.
///
/// Runs in two phases: initialiseData splits the input into `diffways` sorted
/// run files of at most `mem` integers each, then mergeData merges those run
/// files into outputFile.
void externalSort( char* inputFile, char* outputFile, int diffways, int mem) {
    // Phase 1: produce the sorted runs.
    initialiseData(inputFile, mem, diffways);

    // Phase 2: merge the runs into the final output.
    mergeData(outputFile, mem, diffways);
}


} // namespace utils
} // namespace buzzdb
12 changes: 12 additions & 0 deletions src/include/utils/externalSort.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#pragma once

#include <cstdint>
#include <memory>

namespace buzzdb{
namespace utils{

/// Sorts the whitespace-separated integers in inputFile and writes them,
/// in ascending order, to outputFile.
///
/// @param inputFile  path of the file to read
/// @param outputFile path of the file to write
/// @param diffways   number of intermediate run files used for the merge
/// @param mem        maximum number of integers sorted in memory per run
void externalSort( char* inputFile, char* outputFile, int diffways, int mem);

} // namespace utils
} // namespace buzzdb
1 change: 1 addition & 0 deletions src/include/utils/inputFile.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
33 22 44 55 66 2
Empty file.
54 changes: 54 additions & 0 deletions src/include/utils/test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@

#include <gtest/gtest.h>
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <exception>
#include <random>
#include <utility>
#include <vector>
#include <iostream>
#include <fstream>

#include "utils/file.h"

// externalSort is redeclared here because this file does not include
// utils/externalSort.h; repeating the declaration is harmless if a header
// already provides it.
namespace buzzdb {
namespace utils {
void externalSort(char* inputFile, char* outputFile, int diffways, int mem);
}  // namespace utils
}  // namespace buzzdb

namespace {

/// Writes a small, known list of integers to a file, runs externalSort on it,
/// and checks that the output file holds the same values in ascending order.
TEST(FileTest, BasicTest) {
    const int ways = 10;
    const int mem = 1000;
    std::vector<int> values = {33, 22, 44, 55, 66, 2};
    char inputFile[] = "inputFile.txt";
    char outputFile[] = "outputFile.txt";

    {
        // Scoped so the stream is flushed and closed before the sort reads it.
        std::ofstream input(inputFile);
        ASSERT_TRUE(input.is_open());
        for (int value : values) {
            input << value << ' ';
        }
    }

    buzzdb::utils::externalSort(inputFile, outputFile, ways, mem);

    std::ifstream output(outputFile);
    ASSERT_TRUE(output.is_open());
    std::vector<int> sorted;
    for (int value = 0; output >> value;) {
        sorted.push_back(value);
    }

    std::sort(values.begin(), values.end());
    EXPECT_EQ(values, sorted);
}
} // namespace

/// Test-runner entry point: hands the command line to GoogleTest and returns
/// the result of running every registered test.
int main(int argc, char *argv[]) {
    testing::InitGoogleTest(&argc, argv);
    const int status = RUN_ALL_TESTS();
    return status;
}