C语言文本分析-CSDN博客

本文介绍了一个使用C语言实现的程序，该程序能够从大型文本文件中统计并输出出现频率最高的100个单词。通过构建二叉搜索树来存储单词及其出现次数，并利用快速排序算法对单词按频率进行排序。

完整工程

https://download.csdn.net/download/renzemingcsdn/21378979

需求

在这个作业中，你将从一个大文本文件中找出出现频率最高的100个单词。程序必须用C语言实现。由字母 a…z、A…Z 组成的连续字符串（可以带撇号 '）被视为一个单词。大写字母和小写字母视为相同。程序应打印出现频率最高的100个单词及其频率，这些单词按频率降序排列。因为文件可能非常大，所以需要合适的数据结构来存储单词及其频率，例如哈希表或二叉搜索树。可以使用某种快速排序算法对该结构进行排序，从而找出最频繁的单词。
在这里插入图片描述

原理

采用二叉搜索树统计频率，快速排序找到最多的单词。

//https://www.cnblogs.com/landpack/p/4787004.html
//http://data.biancheng.net/view/71.html

实现

main.c中，读取文件，每读取一个单词，将其插入二叉搜索树，而后进行快速排序，最后输出单词。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "readtxt.h"
#include "binarytree.h"

#define INFILE "./Test/Bulk.txt"
#define OUTFILE "./outtest.txt"

/* Maximum number of top-ranked words written to the report. */
#define TOP_WORDS 100

/*
 * QSort is defined in quicksort.c, but quicksort.h only declares swap() and
 * is not included here, so the prototype is spelled out to avoid calling an
 * undeclared function.
 */
void QSort(node **L, int low, int high);

/* Running count of every word read from the input, duplicates included. */
int total_num=0;

/*
 * Reads INFILE word by word, counts each word in a binary search tree,
 * sorts the node index `words` by descending count and writes a report
 * (totals plus up to TOP_WORDS most frequent words) to OUTFILE.
 *
 * Returns 0 on success, -2 if INFILE cannot be opened, -3 if OUTFILE cannot
 * be opened, -4 if OUTFILE cannot be flushed and closed.
 */
int main(void)
{
    FILE *fp_in, *fp_out;
    char str[LONGEST_WORD];

    if((fp_in=fopen(INFILE,"r"))==NULL){
        printf("\nCannot open in file strike any key exit!");
        return -2;
    }
    if((fp_out=fopen(OUTFILE,"w"))==NULL){
        printf("\nCannot open out file strike any key exit!");
        fclose(fp_in);  /* do not leak the already opened input stream */
        return -3;
    }

    node *root = NULL;
    while(getWord(fp_in,str)==1){
        total_num++;
        insert(&root,str);
    }

    /* insert() fills words[1..total_difrent_num]; slot 0 is not a word. */
    QSort(words,1,total_difrent_num);

    fprintf(fp_out, "Total number of words = %d\n", total_num);
    fprintf(fp_out, "Number of different words = %d\n", total_difrent_num);
    fputs( "The 100 most common words:\nWORD            NUMBER OF OCCURRENCES\n",fp_out);

    /* A small input may hold fewer than TOP_WORDS distinct words; entries
     * beyond total_difrent_num are NULL and must not be dereferenced. */
    int shown = total_difrent_num < TOP_WORDS ? total_difrent_num : TOP_WORDS;
    for(int i=1;i<=shown;i++){
        fputs( words[i]->str, fp_out);
        fputs( "             ", fp_out);
        fprintf(fp_out , "%d\n" , words[i]->count );
    }

    /* The tree nodes are left for the OS to reclaim at process exit. */
    fclose(fp_in);
    if(fclose(fp_out)!=0){
        /* fclose flushes buffered output, so a failure means a truncated report. */
        printf("\nCannot write out file!");
        return -4;
    }
    return 0;
}

binarytree.h
有个测试用例中，单词长度达到69，所以字符串开大一点

// Public interface of the word-counting binary search tree.
#ifndef BINARYTREE_H_INCLUDED
#define BINARYTREE_H_INCLUDED
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define LONGEST_WORD  73    // Size in bytes of a word buffer, terminating NUL included
// One tree node: a word and the number of times it has been inserted.
struct binary_tree {
    char str[LONGEST_WORD];       // the word, NUL-terminated
    int count;                    // occurrences: 1 on creation, +1 per repeated insert
    struct binary_tree * left;    // subtree of words that compare smaller (cmp == -1)
    struct binary_tree * right;   // subtree of words that compare greater (cmp == 1)
};
typedef struct binary_tree node;
// Number of distinct words, i.e. nodes created by insert() so far.
extern int total_difrent_num;
// Pointers to every node created by insert(), stored from index 1 upwards.
extern node* words[];
// Compares two NUL-terminated strings; returns 0 if equal, -1 if s < t, 1 if s > t.
int cmp(char * s, char * t);
// Adds one occurrence of val to the tree rooted at *tree, creating a node if needed.
void insert(node ** tree, char * val);
// Frees every node of the tree.
void deltree(node * tree);
// Prints the words and their counts to stdout, left subtree first.
void print_inorder(node * tree);
#endif // BINARYTREE_H_INCLUDED

binarytree.c 注意比较函数的写法，返回0，1，-1分别代表相等、大于、小于。

#include "binarytree.h"
// Number of distinct words seen so far; insert() increments it once per new node.
int total_difrent_num=0;
// Flat index of all tree nodes so they can be sorted by count without touching
// the tree. insert() stores the k-th new node at words[k]; it never assigns slot 0.
node* words[3000000];
/*
 * Compares two NUL-terminated strings character by character.
 *
 * Returns 0 when both strings are identical, -1 when the first differing
 * character of s is smaller than that of t (this includes s being a proper
 * prefix of t), and 1 otherwise.
 */
int cmp(char * s, char * t)
{
    char *p = s;
    char *q = t;

    /* Walk the common prefix; stop at the first mismatch or at the end of s. */
    while (*p != '\0' && *p == *q) {
        p++;
        q++;
    }

    if (*p == *q) {
        /* Only possible when both cursors sit on the terminator. */
        return 0;
    }
    return (*p < *q) ? -1 : 1;
}

/*
 * Records one occurrence of the word val in the binary search tree rooted
 * at *tree.
 *
 * If the word is already present its count is incremented. Otherwise a new
 * node with count 1 is allocated, linked into the tree, counted in
 * total_difrent_num and appended to the words[] index (slots start at 1).
 *
 * tree: address of the root pointer; *tree may be NULL for an empty tree.
 * val:  NUL-terminated word. A word that does not fit into a node's str
 *       buffer is truncated to fit and counted under the truncated key.
 *
 * The process is terminated with an error message when memory is exhausted
 * or when words[] has no free slot left.
 */
void insert(node ** tree, char * val) {
    /* Bounded private copy: every comparison and the stored word use the
     * same key, and an over-long input cannot overrun the node buffer. */
    char key[sizeof (*tree)->str];
    snprintf(key, sizeof key, "%s", val);

    /* Iterative descent keeps stack usage constant even when the tree
     * degenerates into a list (for example on already sorted input). */
    node **link = tree;
    while (*link != NULL) {
        int ret = cmp(key, (*link)->str);
        if (ret == 0) {
            (*link)->count++;
            return;
        }
        link = (ret > 0) ? &(*link)->right : &(*link)->left;
    }

    /* The new node goes to words[total_difrent_num + 1]. */
    const int capacity = (int)(sizeof words / sizeof words[0]);
    if (total_difrent_num + 1 >= capacity) {
        fprintf(stderr, "insert: more than %d distinct words\n", capacity - 1);
        exit(EXIT_FAILURE);
    }

    node *temp = malloc(sizeof *temp);
    if (temp == NULL) {
        fprintf(stderr, "insert: out of memory\n");
        exit(EXIT_FAILURE);
    }
    temp->left = temp->right = NULL;
    strcpy(temp->str, key);   /* key is guaranteed to fit, see above */
    temp->count = 1;

    *link = temp;
    total_difrent_num++;
    words[total_difrent_num] = temp;
}
/*
 * Releases every node of the tree rooted at `tree`: both subtrees first,
 * then the node itself. A NULL tree is accepted and does nothing.
 */
void deltree(node * tree) {
    if (tree == NULL) {
        return;
    }

    node *smaller = tree->left;
    node *greater = tree->right;

    deltree(smaller);
    deltree(greater);
    free(tree);
}
/* Recursive worker: prints the subtree in order, without any heading. */
static void print_inorder_nodes(node * tree) {
    if (tree == NULL) {
        return;
    }
    print_inorder_nodes(tree->left);
    printf("[%s\t\t\t]count:[%d]\n",tree->str,tree->count);
    print_inorder_nodes(tree->right);
}

/*
 * In-order traversal: prints a heading once, then every word with its count
 * to stdout, visiting the left subtree, the node, then the right subtree,
 * which yields the words in ascending cmp() order.
 *
 * The heading is emitted here rather than in the recursive worker so that it
 * appears exactly once instead of once per visited (and per NULL) subtree.
 */
void print_inorder(node * tree) {
    printf("In Order Display\n");
    print_inorder_nodes(tree);
}

quicksort.h

// Declarations for the sorting helpers that operate on arrays of node pointers.
#ifndef QUICKSORT_H_INCLUDED
#define QUICKSORT_H_INCLUDED
#include "binarytree.h"
// Exchanges the two node pointers stored at *a and *b; the nodes themselves are not copied.
// NOTE(review): Partition() and QSort() from quicksort.c are not declared in this header.
void swap(node **a,node **b);
#endif // QUICKSORT_H_INCLUDED

quicksort.c 注意交换函数，交换的是words[]中的指针，不要交换node数据，不然开销太大。也就是，依据node->count排序，交换node*，数据始终不动

#include <stdio.h>
#include <stdlib.h>

#include "binarytree.h"
#include "quicksort.h"
/*
 * Exchanges the node pointers held in *a and *b.
 * Only the pointers move; the node data they refer to is left untouched.
 */
void swap(node **a,node **b){
    node *first = *a;
    node *second = *b;

    *a = second;
    *b = first;
}
/*
 * Partitions L[low..high] around the count of L[low] for a DESCENDING sort.
 *
 * On return the pivot sits at the returned index p, every element of
 * L[low..p-1] has count >= the pivot's count and every element of
 * L[p+1..high] has count <= the pivot's count. Only pointers are moved.
 *
 * Requires low <= high. The pivot is kept in a local variable, so no slot
 * outside L[low..high] is read or written.
 *
 * Both scans stop on keys EQUAL to the pivot and the cursors step past each
 * moved element. This spreads equal counts over both sides; skipping them
 * instead would leave the pivot at `low` whenever all counts are equal,
 * making the sort quadratic on the long runs of equal counts that are
 * typical for word frequencies.
 */
int Partition(node **L,int low,int high){
    node *pivot = L[low];          /* lifting the pivot out leaves a hole at low */
    int pivotkey = pivot->count;

    while (low < high) {
        /* From the right: skip elements that already belong right of the pivot. */
        while (low < high && L[high]->count < pivotkey) {
            high--;
        }
        if (low < high) {
            L[low++] = L[high];    /* fill the left hole; the hole is now at high */
        }

        /* From the left: skip elements that already belong left of the pivot. */
        while (low < high && L[low]->count > pivotkey) {
            low++;
        }
        if (low < high) {
            L[high--] = L[low];    /* fill the right hole; the hole is now at low */
        }
    }

    L[low] = pivot;                /* the cursors met on the remaining hole */
    return low;
}
/*
 * Quicksort of the node pointers L[low..high] into the order produced by
 * Partition() (descending count). Ranges with fewer than two elements are
 * left untouched.
 *
 * L is an array of node pointers (callers pass `words`), hence node **,
 * the same type Partition() expects.
 *
 * Only the smaller side of each split is handled recursively; the larger
 * side is processed by the loop. This bounds the recursion depth by
 * log2(high - low + 1) regardless of how unbalanced the splits are.
 */
void QSort(node **L,int low,int high){
    while (low < high) {
        int pivotloc = Partition(L, low, high);

        if (pivotloc - low < high - pivotloc) {
            QSort(L, low, pivotloc - 1);
            low = pivotloc + 1;
        } else {
            QSort(L, pivotloc + 1, high);
            high = pivotloc - 1;
        }
    }
}
}