1.生成文本测试文件
张泽鹏先生创作
import random
import re
import string
def main(count=1_000_000, min_length=10, max_length=100):
    """Print ``count`` random text lines to standard output.

    Every line has a length drawn uniformly from ``min_length`` to
    ``max_length`` (both inclusive) and consists of characters taken from
    ``string.printable`` with all whitespace removed, so a line never
    contains spaces, tabs or embedded newlines.

    Args:
        count: Number of lines to print (defaults to one million).
        min_length: Smallest allowed line length.
        max_length: Largest allowed line length.
    """
    # Alphabet: printable ASCII minus every whitespace character.
    values = re.sub(r'\s+', '', string.printable)
    for _ in range(count):
        length = random.randint(min_length, max_length)
        line = ''.join(random.choices(values, k=length))
        print(line)


if __name__ == '__main__':
    main()
执行
time python3 gendata.py >varchar.txt
real 0m8.522s
user 0m8.364s
sys 0m0.096s
2.c语言
张泽鹏先生创作,我将他的比较函数改为库函数strcmp
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
/* Short aliases for the primitive types used by the sorter. */
typedef char*  String;    /* pointer to characters */
typedef int    File;      /* file descriptor as returned by open() */
typedef size_t Size;      /* byte count or element count */
typedef int    Ordering;

/* State of one file being sorted. */
struct _FileSorter {
    String  name;        /* file name */
    File    file;        /* file descriptor */
    Size    char_count;  /* number of bytes, i.e. the file length */
    Size    line_count;  /* number of lines; '\n' separates lines and a
                            final line without a trailing '\n' still counts */
    String  content;     /* file contents */
    String* lines;       /* pointers to the first character of each line */
};
typedef struct _FileSorter* FileSorter;

/* Construction and the individual initialisation steps. */
FileSorter CreateFileSorter(String name);
void OpenFileSorterFile(FileSorter sorter);
void CountFileSorterChars(FileSorter sorter);
void MapFileSorterChars(FileSorter sorter);
void CountFileSorterLines(FileSorter sorter);
void MapFileSorterLines(FileSorter sorter);

/* Operations on an initialised sorter. */
void SortFileSorterLines(FileSorter sorter);
void DisplayFileSorterLines(FileSorter sorter);
void DisposeFileSorter(FileSorter sorter);
/**
 * Allocate a sorter for the file `name` and run every initialisation step:
 * open the file, read its size, map it into memory, count its lines and
 * build the table of line-start pointers.
 *
 * `name` is stored by pointer, not copied, so it must stay valid for as
 * long as the sorter is used.
 *
 * Terminates the process with EXIT_FAILURE when the sorter itself cannot
 * be allocated.
 */
FileSorter CreateFileSorter(String name) {
    /* calloc(count, size): zero-filled, so pointers start out as NULL. */
    FileSorter this = (FileSorter)calloc(1, sizeof(*this));
    if (this == NULL) {
        fprintf(stderr, "allocate sorter for %s failed\n", name);
        exit(EXIT_FAILURE);
    }
    this->name = name;
    OpenFileSorterFile(this);    /* obtain the file descriptor */
    CountFileSorterChars(this);  /* obtain the file length */
    MapFileSorterChars(this);    /* map the file contents into memory */
    CountFileSorterLines(this);  /* count the lines */
    MapFileSorterLines(this);    /* record where each line starts */
    return this;
}
/**
 * Release everything owned by the sorter: the line table, the memory
 * mapping, the file descriptor and finally the sorter itself.
 * Passing NULL is allowed and does nothing.
 *
 * NOTE: descriptor 0 is a valid descriptor (open() returns it when stdin
 * was closed), so any non-negative `file` is closed. This assumes the
 * sorter went through OpenFileSorterFile, as CreateFileSorter guarantees.
 */
void DisposeFileSorter(FileSorter this) {
    if (this == NULL) {
        return;
    }
    free(this->lines);  /* free(NULL) is a no-op */
    this->lines = NULL;
    /* Only unmap a real mapping: neither "never mapped" nor MAP_FAILED. */
    if (this->content != NULL && (void*)this->content != MAP_FAILED) {
        munmap(this->content, this->char_count);
        this->content = NULL;
    }
    if (this->file >= 0) {
        close(this->file);
        this->file = -1;
    }
    free(this);
}
/**
 * Open the sorter's file read-only and remember the descriptor.
 * Prints a message to stderr and exits with EXIT_FAILURE when open() fails.
 */
void OpenFileSorterFile(FileSorter this) {
    const int descriptor = open(this->name, O_RDONLY);
    this->file = descriptor;
    if (descriptor >= 0) {
        return;
    }
    fprintf(stderr, "open %s failed\n", this->name);
    exit(EXIT_FAILURE);
}
/**
 * Store the file's size in bytes, as reported by fstat(), in char_count.
 * On failure the descriptor is closed, a message is printed to stderr and
 * the process exits with EXIT_FAILURE.
 */
void CountFileSorterChars(FileSorter this) {
    struct stat info;
    const int result = fstat(this->file, &info);
    if (result >= 0) {
        this->char_count = info.st_size;
        return;
    }
    close(this->file);
    fprintf(stderr, "count %s failed\n", this->name);
    exit(EXIT_FAILURE);
}
/**
 * Map the whole file (char_count bytes from offset 0) into memory as a
 * private, read-only mapping and store its address in content.
 * On failure the descriptor is closed, a message is printed to stderr and
 * the process exits with EXIT_FAILURE.
 *
 * NOTE(review): POSIX mmap() rejects a length of 0, so an empty file is
 * presumably reported here as a mapping failure - confirm on the target OS.
 */
void MapFileSorterChars(FileSorter this) {
    void* mapping = mmap(NULL, this->char_count, PROT_READ, MAP_PRIVATE, this->file, 0);
    this->content = mapping;
    if (mapping != MAP_FAILED) {
        return;
    }
    close(this->file);
    fprintf(stderr, "mmap %s failed\n", this->name);
    exit(EXIT_FAILURE);
}
/**
 * Count the lines in the mapped file and store the result in line_count.
 *
 * Lines are separated by '\n'; a final line without a trailing '\n' is
 * counted too. A NUL byte ends the scan, exactly as the end of the file
 * does.
 *
 * The scan is bounded by char_count: the mapping is not NUL-terminated, so
 * string functions such as strchr() could run past its end.
 */
void CountFileSorterLines(FileSorter this) {
    const char* cursor = this->content;
    this->line_count = 0;
    if (cursor == NULL) {
        return;
    }
    const char* const end = cursor + this->char_count;
    while (cursor < end && *cursor != '\0') {
        this->line_count++;
        /* Advance to the end of the current line. */
        while (cursor < end && *cursor != '\0' && *cursor != '\n') {
            cursor++;
        }
        if (cursor == end || *cursor != '\n') {
            break;  /* hit end of file or a NUL: no further lines */
        }
        cursor++;  /* step over the newline to the next line start */
    }
}
/**
 * Build the table of line-start pointers: lines[i] points at the first
 * character of line i inside the mapped content.
 *
 * With no lines the table is left as NULL. The search for each newline is
 * bounded by char_count because the mapping is not NUL-terminated. Should
 * fewer line starts be found than line_count announces, line_count is
 * reduced so that every entry that remains points inside the file.
 * Exits with EXIT_FAILURE when the table cannot be allocated.
 */
void MapFileSorterLines(FileSorter this) {
    if (this->line_count == 0) {
        this->lines = NULL;  /* nothing to index; avoids writing lines[0] */
        return;
    }
    this->lines = calloc(this->line_count, sizeof(*this->lines));
    if (this->lines == NULL) {
        fprintf(stderr, "allocate line table for %s failed\n", this->name);
        exit(EXIT_FAILURE);
    }
    char* const end = this->content + this->char_count;
    this->lines[0] = this->content;
    for (size_t index = 1; index < this->line_count; index++) {
        char* previous = this->lines[index - 1];
        char* newline = memchr(previous, '\n', (size_t)(end - previous));
        if (newline == NULL || newline + 1 >= end) {
            /* No further line starts inside the file. */
            this->line_count = index;
            break;
        }
        this->lines[index] = newline + 1;
    }
}
/*
int compare(const void* left, const void* right) {
String a = *(String*)left;
String b = *(String*)right;
for (; *a && *b && *a != '\n' && *b != '\n'; a++, b++) {
if (*a < *b) {
return -1;
} else if (*a > *b) {
return 1;
}
}
return 0;
}
*/
/**
 * qsort() comparator for two line-start pointers.
 *
 * Compares the two lines byte by byte as unsigned characters. A line ends
 * at '\n' or at a NUL byte; when one line is a prefix of the other, the
 * shorter one orders first. Returns a negative, zero or positive value.
 *
 * This replaces a plain strcmp() call: the lines are terminated by '\n',
 * not by NUL, so strcmp() keeps comparing into the following lines and
 * orders any byte smaller than '\n' (for example TAB) before the end of a
 * line.
 *
 * The comparator cannot see the end of the buffer, so each line must be
 * followed by a readable '\n' or NUL byte.
 */
int compare(const void* left, const void* right) {
    const unsigned char* a = *(const unsigned char* const*)left;
    const unsigned char* b = *(const unsigned char* const*)right;
    for (;; a++, b++) {
        const int a_ended = (*a == '\0' || *a == '\n');
        const int b_ended = (*b == '\0' || *b == '\n');
        if (a_ended || b_ended) {
            /* both ended: 0; only a ended: -1; only b ended: +1 */
            return b_ended - a_ended;
        }
        if (*a != *b) {
            return (*a < *b) ? -1 : 1;
        }
    }
}
/**
 * Sort the table of line-start pointers with qsort(), using compare() to
 * order the lines. Only the pointers are rearranged.
 */
void SortFileSorterLines(FileSorter this) {
    const size_t count = this->line_count;
    const size_t width = sizeof(this->lines[0]);
    qsort(this->lines, count, width, compare);
}
/**
 * Write every line to standard output in table order, each followed by a
 * single '\n'.
 *
 * A line ends at '\n', at a NUL byte, or at the end of the file. The scan
 * is bounded by char_count because the mapping is not NUL-terminated. Each
 * line is written with one fwrite() call instead of one putchar() per byte.
 */
void DisplayFileSorterLines(FileSorter this) {
    if (this->line_count == 0) {
        return;
    }
    const char* const end = this->content + this->char_count;
    for (size_t index = 0; index < this->line_count; index++) {
        const char* start = this->lines[index];
        const char* stop = start;
        while (stop < end && *stop != '\0' && *stop != '\n') {
            stop++;
        }
        fwrite(start, 1, (size_t)(stop - start), stdout);
        putchar('\n');
    }
}
/**
 * Entry point: sort the lines of the file named by the first command-line
 * argument and print them to standard output.
 * Prints a usage message and exits with EXIT_FAILURE when no file name is
 * given.
 */
int main(int argc, char* argv[]) {
    const int has_filename = (argc >= 2);
    if (!has_filename) {
        fprintf(stderr, "Usage: %s filename\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    FileSorter sorter = CreateFileSorter(argv[1]);

    SortFileSorterLines(sorter);
    DisplayFileSorterLines(sorter);

    DisposeFileSorter(sorter);
    return EXIT_SUCCESS;
}
编译执行
gcc zhangmainstrcmp.c -o zhangzpdockero2 -O2
root@66d4e20ec1d7:/par# time ./zhangzpdockero2 varchar.txt >csort.txt
real 0m0.944s
user 0m0.884s
sys 0m0.036s
3.python语言
张泽鹏先生创作
import sys


def read_sorted_lines(path):
    """Return the lines of the UTF-8 text file ``path`` in ascending order.

    Only the line terminator is removed from each line. Leading and
    trailing spaces or tabs are part of the line and are kept, so they
    take part in the comparison and appear in the output.
    """
    with open(path, encoding='utf-8') as file:
        return sorted(line.rstrip('\n') for line in file)


if __name__ == '__main__':
    for line in read_sorted_lines(sys.argv[1]):
        print(line)
执行
time python3 4lines.py varchar.txt >qsort.txt
real 0m1.769s
user 0m1.504s
sys 0m0.104s
4.zig语言
DeepSeek编写,提示词
我有一个文本文件str.txt,每行只包含一个字符串,请编写从此文件读取全部字符串、升序排序后按新顺序逐行输出全部字符串的程序,实现类似linux命令sort的效果,结果可重定向到文件,以便检查。
const std = @import("std");
//const builtin = @import("builtin");
const mem = std.mem;
const fs = std.fs;
const sort = std.sort;
/// Read the file named by the first command-line argument, sort its lines
/// in ascending byte order and write them to standard output, one per line.
///
/// Output goes through a buffered writer and is flushed once at the end.
/// Writing each line straight to stdout costs one write system call per
/// line, which dominates the run time on large inputs.
///
/// Blank lines are kept. A single trailing newline at the end of the file
/// does not produce an extra empty line.
pub fn main() !void {
    // General-purpose allocator for everything below.
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    // Command-line arguments.
    const args = try std.process.argsAlloc(allocator);
    defer std.process.argsFree(allocator, args);
    if (args.len < 2) {
        std.debug.print("Usage: {s} <filename>\n", .{args[0]});
        return error.InvalidArgs;
    }
    const filename = args[1];

    // Read the whole file into memory.
    const file = try fs.cwd().openFile(filename, .{});
    defer file.close();
    const file_size = (try file.stat()).size;
    const file_content = try file.readToEndAlloc(allocator, file_size);
    defer allocator.free(file_content);

    // Drop one trailing newline so that it does not yield a final empty line.
    const text = if (mem.endsWith(u8, file_content, "\n"))
        file_content[0 .. file_content.len - 1]
    else
        file_content;

    // Split into lines; splitScalar keeps empty lines, unlike tokenize.
    var lines = std.ArrayList([]const u8).init(allocator);
    defer lines.deinit();
    if (file_content.len > 0) {
        var line_iter = mem.splitScalar(u8, text, '\n');
        while (line_iter.next()) |line| {
            try lines.append(line);
        }
    }

    // Sort ascending in byte-wise lexicographic order.
    std.mem.sort([]const u8, lines.items, {}, comptime ascString);

    // Buffered output; can be redirected to a file.
    var buffered = std.io.bufferedWriter(std.io.getStdOut().writer());
    const stdout = buffered.writer();
    for (lines.items) |line| {
        try stdout.writeAll(line);
        try stdout.writeByte('\n');
    }
    try buffered.flush();
}
/// Ascending "less than" predicate for sorting strings: true only when
/// `a` orders strictly before `b` according to `mem.order`.
fn ascString(_: void, a: []const u8, b: []const u8) bool {
    return switch (mem.order(u8, a, b)) {
        .lt => true,
        .eq, .gt => false,
    };
}
编译执行
zig build-exe sortstrfile.zig -O ReleaseFast
time ./sortstrfile varchar.txt >zigsort.txt
real 0m3.718s
user 0m0.824s
sys 0m2.876s
5.DuckDB 1.13 SQL
-- Read varchar.txt as a single text column i, sort it, and export the result.
COPY (
    SELECT i
    FROM read_csv('varchar.txt', header = 0, all_varchar = 1) AS t(i)
    ORDER BY i
) TO 'sortvarchar100w.txt';
Run Time (s): real 0.374 user 1.872000 sys 0.124000
6.julia
让deepseek翻译上述python代码,再经过我的调整,从多行语句简化为单行,同时提高了效率。
# Read a file, sort its lines, and print them one per line.
if isempty(ARGS)
    println("Usage: julia script.jl filename")
    exit(1)
end
filename = ARGS[1]

# Earlier, longer variants, kept for reference:
#   lines = open(filename) do file
#       readlines(file)
#   end
#   sorted_lines = sort([strip(line) for line in lines])   # strip, then sort
#   foreach(println, sorted_lines)
#   foreach(println, sort([strip(line) for line in lines]))
#   foreach(println, sort([line for line in lines]))
#   foreach(println, sort(lines))

# Read every line of the file, sort, and print each line.
sorted_lines = sort(readlines(filename))
foreach(println, sorted_lines)
执行
time julia-1.12.0-rc1/bin/julia sort_lines.jl varchar.txt >jlnosort.txt
real 0m1.541s
user 0m1.508s
sys 0m0.132s
7.Linux sort命令
time sort varchar.txt >linuxsort.txt
real 0m0.348s
user 0m0.852s
sys 0m0.056s
#单线程
time sort --parallel=1 varchar.txt >linuxsort1.txt
real 0m0.910s
user 0m0.652s
sys 0m0.064s
总结
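按real时间排序:Linux sort命令多线程最快(0.348s),duckdb多线程第二(0.374s),单线程sort(0.910s)与c语言(0.944s)相当,c语言在各编程语言实现中排第三,julia(1.541s)和python(1.769s)次之,python语言出乎意料地快又简洁,zig最慢(3.718s),慢得出奇。不过zig的user时间只有0.824s,sys时间却高达2.876s,说明时间主要花在系统调用上(很可能是逐行无缓冲输出造成的),而不是排序本身。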
不同语言对百万行文本文件排序对比


被折叠的 条评论
为什么被折叠?



