using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace ConsoleApplication1
{
/*
*
* Problem 2: (Please write this one in C# if you know C#)
* Write a program that counts word occurrences in a directory of
* UTF-8-encoded text files and outputs a file (UTF-8) containing a list
* sorted (a) by frequency and (b) alphabetically within the same
* frequency. Words should be normalized to lower case
* (i.e., "Hat", "hAt" and "HAT" should all be normalized to "hat").
*
* You can assume words are space-separated tokens.
*
* The directory may contain subdirectories, and we need to search through
* all of them. Also, the directory may contain different types of files,
* but we should only process files with the extension ".txt".
*
* Assume the only punctuation marks are: , . !
*/
/// <summary>
/// Counts word occurrences in every ".txt" file under a directory tree and writes
/// the totals to a UTF-8 file, sorted by descending frequency and then by word.
/// </summary>
class Program
{
    /// <summary>
    /// Characters that end a word: the space separator plus the punctuation marks , . !
    /// </summary>
    private static readonly char[] WordSeparators = { ' ', ',', '.', '!' };

    /// <summary>
    /// Counts the words under the input directory, writes the sorted result to the
    /// output file, then waits for a line of console input before exiting.
    /// </summary>
    /// <param name="args">
    /// Optional overrides: <c>args[0]</c> is the input directory (default "test") and
    /// <c>args[1]</c> is the output file path (default "output.txt").
    /// </param>
    static void Main(string[] args)
    {
        string inputDictPath = (args != null && args.Length > 0) ? args[0] : @"test";
        string outputFilePath = (args != null && args.Length > 1) ? args[1] : @"output.txt";

        if (Directory.Exists(inputDictPath))
        {
            Dictionary<string, int> dict = new Dictionary<string, int>();
            // Walk the directory tree and accumulate the counts in dict.
            WalkThroughDirectory(inputDictPath, dict);
            // Sort and write to the output file.
            Output(dict, outputFilePath);
        }
        else
        {
            // Report the problem instead of failing with an unhandled exception.
            Console.Error.WriteLine("Input directory not found: {0}", inputDictPath);
            Environment.ExitCode = 1;
        }

        // Wait for input so the console output stays visible.
        System.Console.ReadLine();
    }

    /// <summary>
    /// Recursively traverses a directory and counts the words of every file whose
    /// extension is ".txt" (compared case-insensitively).
    /// </summary>
    /// <param name="directory">Path of the directory to traverse.</param>
    /// <param name="dict">Dictionary that accumulates the count of each word.</param>
    static void WalkThroughDirectory(string directory, Dictionary<string, int> dict)
    {
        DirectoryInfo current = new DirectoryInfo(directory);

        foreach (FileInfo file in current.GetFiles())
        {
            // Ordinal, case-insensitive match on the extension itself so that
            // "NOTES.TXT" is processed and the result does not depend on the culture.
            if (string.Equals(file.Extension, ".txt", StringComparison.OrdinalIgnoreCase))
            {
                CountWords(file.FullName, dict);
            }
        }

        foreach (DirectoryInfo subDirectory in current.GetDirectories())
        {
            WalkThroughDirectory(subDirectory.FullName, dict);
        }
    }

    /// <summary>
    /// Reads a UTF-8 text file line by line and adds each of its words, normalized
    /// to lower case, to the running totals.
    /// </summary>
    /// <param name="filepath">Path of the text file to read.</param>
    /// <param name="dict">Dictionary that accumulates the count of each word.</param>
    static void CountWords(string filepath, Dictionary<string, int> dict)
    {
        // The using block releases the file handle even if reading throws.
        using (StreamReader reader = new StreamReader(filepath, Encoding.UTF8))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                // Splitting on the separators and dropping empty entries replaces the
                // punctuation-to-space substitution and the zero-length token check.
                string[] tokens = line.Split(WordSeparators, StringSplitOptions.RemoveEmptyEntries);
                foreach (string token in tokens)
                {
                    // Invariant lower-casing keeps the keys identical on every machine.
                    string word = token.ToLowerInvariant();

                    // TryGetValue leaves count at 0 for a new word: one lookup, not two.
                    int count;
                    dict.TryGetValue(word, out count);
                    dict[word] = count + 1;
                }
            }
        }
    }

    /// <summary>
    /// Writes one "word count" line per entry to a UTF-8 file (overwriting it if it
    /// exists), sorted (a) by descending frequency and (b) alphabetically within the
    /// same frequency. Each line is also echoed to the console.
    /// </summary>
    /// <param name="dict">Word counts to write.</param>
    /// <param name="outputFilePath">Path of the file to create or overwrite.</param>
    static void Output(Dictionary<string, int> dict, string outputFilePath)
    {
        // Enumerate the sorted sequence directly. Copying it back into a Dictionary
        // would rely on an enumeration order the framework documents as undefined.
        IOrderedEnumerable<KeyValuePair<string, int>> sorted =
            dict.OrderByDescending(p => p.Value).ThenBy(p => p.Key);

        // The using block flushes and closes the file even if a write throws.
        using (StreamWriter writer = new StreamWriter(outputFilePath, false, Encoding.UTF8))
        {
            foreach (KeyValuePair<string, int> kvp in sorted)
            {
                writer.WriteLine("{0} {1}", kvp.Key, kvp.Value);
                // For debug
                Console.WriteLine("{0} {1}", kvp.Key, kvp.Value);
            }
        }
    }
}
}
// Word frequency count
// Latest recommended article published on 2022-03-21 19:46:40