FileSort

最新推荐文章于 2025-08-14 23:18:15 发布
原创最新推荐文章于 2025-08-14 23:18:15 发布 · 538 阅读
0 ·
CC 4.0 BY-SA版权
Linux 同时被 2 个专栏收录
383 篇文章
订阅专栏
C++算法系列
256 篇文章
订阅专栏
/*************************************************************************
    > File Name: FileSort.h
    > Author: wangzhicheng
    > Mail: 2363702560@qq.com 
    > Created Time: Sat 31 Dec 2016 09:30:39 AM AWST
	> Brief:sort strings in the file
 ************************************************************************/
#ifndef FILE_SORT_H
#define FILE_SORT_H
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <omp.h>
#include <algorithm>
#include <cstddef>
#include <fstream>
#include <functional>
#include <iostream>
#include <string>
#include <thread>
#include <utility>
#include <vector>
namespace filesort
{
// Sort-order selectors accepted by FileSort::sort and FileSort::threadsort.
#define ASC 0
#define DESC 1
// NOTE: FileSort.cpp and main.cpp rely on this directive for their
// unqualified standard-library names, so it has to stay.
using namespace std;
class FileSort 
{
	public:
	/*
	 * @brief sort the lines of a text file into another file
	 * @input_path full path of the file whose lines are sorted
	 * @output_path full path of the file that receives the sorted lines
	 * @sortype ASC (default) or DESC
	 * @threadnum requested number of sorting threads (default 2)
	 * @return true if the sort succeeded, false otherwise
	 *         (for example when a file could not be opened)
	 * */
	static bool sort(const char *input_path, const char *output_path, int sortype = ASC, int threadnum = 2);
	private:
	/*
	 * @brief sort one batch of strings in place with std::sort
	 * @input the strings to reorder
	 * @sortype ASC for ascending order, DESC for descending order;
	 *          any other value leaves input untouched
	 * */
	inline static void threadsort(vector<string>&input, int sortype)
	{
		if(sortype == ASC)
		{
			std::sort(input.begin(), input.end());
		}
		else if(sortype == DESC)
		{
			// Descending order: "greater" plays the role of "less".
			std::sort(input.begin(), input.end(), std::greater<string>());
		}
	}
};
}
#endif

/*************************************************************************
    > File Name: FileSort.cpp
    > Author: wangzhicheng
    > Mail: 2363702560@qq.com 
    > Created Time: Sat 31 Dec 2016 09:30:39 AM AWST
	> Brief: sort strings in a file
 ************************************************************************/
#include "FileSort.h"
namespace filesort
{
bool FileSort::sort(const char *input_path, const char *output_path, int sortype, int threadnum)
{
	// safe check entry arguments
	int MaxThreadNum = omp_get_num_procs() << 1;
	if(threadnum <= 0) threadnum = 1;
	if(threadnum >= MaxThreadNum) threadnum = MaxThreadNum;
	// open the input file
	ifstream is(input_path);
	if(!is)
	{
		cerr << input_path << " open failed...!" << endl;
		return false;
	}
	// import the string in the input file to various segments
	typedef vector<string> stringset;
	vector<stringset>segments;
	segments.resize(threadnum);		// every thead is responsible for every segment
	int index = 0;
	string line;
	while(getline(is, line))
	{
		segments[index].emplace_back(line);
		index = (index + 1) % threadnum;
	}
	is.close();
	// start theads to sort
	vector<thread>sortthreads;
	int i;
	for(i = 0;i < threadnum;i++)
	{
		stringset &strings = segments[i];
		sortthreads.push_back(thread(FileSort::threadsort, ref(strings), sortype));
	}
	for(auto &th:sortthreads)
	{
		th.join();
	}
	// open the output file
	ofstream os(output_path, ios::trunc);
	if(!os)
	{
		cerr << input_path << " open failed...!" << endl;
		return false;
	}
	// merge the vector to sort
	// init the heap
	vector<pair<string, int> >outstrings;	// first -- key second -- position
	for(i = 0;i < threadnum;i++)
	{
		stringset &strings = segments[i];
		if(!strings.empty())
		{
			outstrings.emplace_back(pair<string, int>(strings.front(), i));
			strings.erase(strings.begin());
		}
	}
	index = 0;
	switch(sortype)
	{
	case ASC:
		make_heap(outstrings.begin(), outstrings.end(), [](const pair<string, int>&p0, const pair<string, int>&p1)
				{
					return p0.first > p1.first;
				});
		break;
	case DESC:
		make_heap(outstrings.begin(), outstrings.end(), [](const pair<string, int>&p0, const pair<string, int>&p1)
				{
					return p0.first < p1.first;
				});
		break;
	}
	while(!outstrings.empty())
	{
		pop_heap(outstrings.begin(), outstrings.end());
		string &key = outstrings.back().first;
		int pos = outstrings.back().second;
		os << key << endl;
		outstrings.pop_back();
		stringset &strings = segments[pos];
		if(!strings.empty())
		{
			outstrings.emplace_back(pair<string, int>(strings.front(), pos));
			strings.erase(strings.begin());
		}
		switch(sortype)
		{
		case ASC:
			make_heap(outstrings.begin(), outstrings.end(), [](const pair<string, int>&p0, const pair<string, int>&p1)
					{
						return p0.first > p1.first;
					});
			break;
		case DESC:
			make_heap(outstrings.begin(), outstrings.end(), [](const pair<string, int>&p0, const pair<string, int>&p1)
					{
						return p0.first < p1.first;
					});
			break;
		}
	}
	os.close();
}
}

/*************************************************************************
    > File Name: main.cpp
    > Author: wangzhicheng
    > Mail: 2363702560@qq.com 
    > Created Time: Sat 31 Dec 2016 09:44:59 AM AWST
 ************************************************************************/

#include "FileSort.h"
#include <iterator>
#include <time.h>
using namespace filesort;
/*
 * @brief demo driver: writes ./input_data filled with random digit strings,
 *        then sorts it in descending order into ./output_data
 * @return 1 if the test input file cannot be created, 0 otherwise
 * */
int main()
{
	// Shape of the generated data: kLineCount lines of kDigitsPerLine digits.
	static const int kDigitsPerLine = 10;
	static const int kLineCount = 100;
	std::ofstream test_input("./input_data", std::ios::trunc);
	if(!test_input)
	{
		std::cerr << "input file generate failed...!" << std::endl;
		return 1;
	}
	srand(time(0));
	for(int row = 0;row < kLineCount;++row)
	{
		std::string digits;
		for(int col = 0;col < kDigitsPerLine;++col)
		{
			// rand() % 10 is a single decimal digit, appended as its character.
			digits += static_cast<char>('0' + rand() % kDigitsPerLine);
		}
		test_input << digits << std::endl;
	}
	test_input.close();
	// NOTE: the result of FileSort::sort is not inspected here.
	FileSort::sort("./input_data", "./output_data", DESC);

	return 0;
}

# Builds the MergeSort demo from FileSort.cpp and main.cpp.

# C++ compiler (the rules below reference it as CC).
CC = g++
# Extra compiler flags; empty by default.
DBG = 

# Default compile flags, used unless DEBUG_SET is already defined.
ifndef  DEBUG_SET
    DEBUG_SET=  -std=c++11 -g -pthread
endif

# Header search path.
IFLAGS = -I .

INDEX_ROOT=..
# Link libraries: pthread for std::thread, gomp for omp_get_num_procs.
LIBS = -L . \
	-lpthread \
	-lgomp

LINK = 


TARGET=MergeSort

# all and clean name actions, not files on disk.
.PHONY: all clean

all:$(TARGET)

OBJS=FileSort.o \
	main.o

$(TARGET):$(OBJS)
	$(CC) -fPIC -o ./$(TARGET) $(OBJS) $(LIBS) $(LINK)

# Both sources include FileSort.h, so rebuild the objects when it changes.
$(OBJS): FileSort.h

.cpp.o:
	$(CC) $(DBG) $(DEBUG_SET) $(IFLAGS) -fPIC -c $<

clean:
	rm -fr *.o
	rm -f $(TARGET)