
Go语言词频统计
发布时间:2022-07-08 15:13:27
词频统计是文本分析最基本的一种形式:统计一个文件中各个单词出现的频率。
示例中的统计结果以两种不同的方式显示:一种是按单词的字母顺序列出每个单词及其出现频率,另一种是按频率从低到高列出每个频率及其对应的单词。完整的示例代码如下所示:
package main
import (
"bufio"
"fmt"
"io"
"log"
"os"
"path/filepath"
"runtime"
"sort"
"strings"
"unicode"
"unicode/utf8"
)
// main is the program entry point. When run with no arguments, or with -h or
// --help as the first argument, it prints a usage line and exits with status
// 1. Otherwise it accumulates word frequencies over every file named on the
// command line and prints two reports: one keyed by word and one keyed by
// frequency.
func main() {
	args := os.Args
	if len(args) == 1 || args[1] == "-h" || args[1] == "--help" {
		fmt.Printf("usage: %s <file1> [<file2> [... <fileN>]]\n",
			filepath.Base(args[0]))
		os.Exit(1)
	}

	// One map is shared by all files, so the counts are cumulative.
	frequencyForWord := make(map[string]int)
	for _, filename := range commandLineFiles(args[1:]) {
		updateFrequencies(filename, frequencyForWord)
	}

	reportByWords(frequencyForWord)
	reportByFrequency(invertStringIntMap(frequencyForWord))
}
// commandLineFiles returns the list of file names to process.
//
// On every platform except Windows the input slice is returned unchanged.
// On Windows each entry is treated as a filepath.Glob pattern (presumably
// because the Windows shell does not expand wildcards itself):
//   - a pattern that Glob rejects is kept verbatim,
//   - a pattern with matches is replaced by those matches,
//   - a pattern with no matches is dropped.
func commandLineFiles(files []string) []string {
	if runtime.GOOS != "windows" {
		return files
	}

	expanded := make([]string, 0, len(files))
	for _, pattern := range files {
		matches, err := filepath.Glob(pattern)
		switch {
		case err != nil:
			// Invalid pattern: pass it through as a literal name.
			expanded = append(expanded, pattern)
		case matches != nil:
			expanded = append(expanded, matches...)
		}
	}
	return expanded
}
// updateFrequencies opens filename and adds the words it contains to
// frequencyForWord. If the file cannot be opened the error is logged and the
// map is left untouched; the file is closed when the function returns.
func updateFrequencies(filename string, frequencyForWord map[string]int) {
	file, err := os.Open(filename)
	if err != nil {
		log.Println("failed to open the file: ", err)
		return
	}
	defer file.Close()

	buffered := bufio.NewReader(file)
	readAndUpdateFrequencies(buffered, frequencyForWord)
}
// readAndUpdateFrequencies reads reader line by line until an error or EOF
// and increments frequencyForWord for every word found. Words are the runs
// of letters returned by SplitOnNonLetters, lower-cased before counting;
// words consisting of a single rune are ignored. A read error other than
// io.EOF is logged. The text returned together with the error is still
// processed before the loop stops.
func readAndUpdateFrequencies(reader *bufio.Reader,
	frequencyForWord map[string]int) {
	for {
		line, err := reader.ReadString('\n')
		for _, word := range SplitOnNonLetters(strings.TrimSpace(line)) {
			// A word longer than utf8.UTFMax bytes must contain more than
			// one rune, so the rune count is only needed for short words.
			if len(word) <= utf8.UTFMax && utf8.RuneCountInString(word) <= 1 {
				continue
			}
			frequencyForWord[strings.ToLower(word)]++
		}
		if err == nil {
			continue
		}
		if err != io.EOF {
			log.Println("failed to finish reading the file: ", err)
		}
		return
	}
}
// SplitOnNonLetters splits s into its maximal runs of Unicode letters. Every
// rune for which unicode.IsLetter is false acts as a separator, and empty
// fields are never returned.
func SplitOnNonLetters(s string) []string {
	return strings.FieldsFunc(s, func(r rune) bool {
		return !unicode.IsLetter(r)
	})
}
// invertStringIntMap turns a string→int map into an int→[]string map: each
// distinct value of intForString becomes a key whose slice holds every
// string that mapped to it. The order of strings within a slice follows map
// iteration order and is therefore unspecified.
func invertStringIntMap(intForString map[string]int) map[int][]string {
	inverted := make(map[int][]string, len(intForString))
	for str, n := range intForString {
		group := inverted[n]
		inverted[n] = append(group, str)
	}
	return inverted
}
// reportByWords prints a two-column table to standard output listing every
// word in frequencyForWord, in sorted order, followed by its count.
//
// The word column is as wide as the longest word (measured in runes) and the
// count column as wide as the longest count. The header places "Frequency"
// so that it ends flush with the count column whenever the columns are wide
// enough; for narrower tables the header simply uses its minimum width.
func reportByWords(frequencyForWord map[string]int) {
	words := make([]string, 0, len(frequencyForWord))
	wordWidth, frequencyWidth := 0, 0
	for word, frequency := range frequencyForWord {
		words = append(words, word)
		if width := utf8.RuneCountInString(word); width > wordWidth {
			wordWidth = width
		}
		if width := len(fmt.Sprint(frequency)); width > frequencyWidth {
			frequencyWidth = width
		}
	}
	sort.Strings(words)

	gap := wordWidth + frequencyWidth - len("Word") - len("Frequency")
	// fmt treats a negative '*' width as "left-justify with the absolute
	// width", which would pad the header with |gap| characters when the
	// columns are narrow. Clamp so short tables get the minimal header.
	if gap < 0 {
		gap = 0
	}
	fmt.Printf("Word %*s%s\n", gap, " ", "Frequency")
	for _, word := range words {
		fmt.Printf("%-*s %*d\n", wordWidth, word, frequencyWidth,
			frequencyForWord[word])
	}
}
// reportByFrequency prints, to standard output, one line per distinct
// frequency in ascending order: the frequency right-aligned to the width of
// the largest one, followed by the comma-separated, sorted list of words
// that occur that many times.
//
// The header line is always printed; an empty or nil map produces the header
// only. The word slices stored in wordsForFrequency are sorted in place.
func reportByFrequency(wordsForFrequency map[int][]string) {
	frequencies := make([]int, 0, len(wordsForFrequency))
	for frequency := range wordsForFrequency {
		frequencies = append(frequencies, frequency)
	}
	sort.Ints(frequencies)

	// The largest frequency is last after sorting; guard the index so an
	// empty map (no words counted) does not panic.
	width := 0
	if n := len(frequencies); n > 0 {
		width = len(fmt.Sprint(frequencies[n-1]))
	}

	fmt.Println("Frequency → Words")
	for _, frequency := range frequencies {
		words := wordsForFrequency[frequency]
		sort.Strings(words)
		fmt.Printf("%*d %s\n", width, frequency, strings.Join(words, ", "))
	}
}
结果:
% go run word-analysis.go ./word-analysis.txt
Word Frequency
about 1
blog 1
encoded 1
for 1
functions 1
go 1
golang 1
https 1
implements 1
in 1
information 1
manipulate 1
org 1
package 1
see 1
simple 1
strings 4
testing 1
to 1
utf 2
Frequency → Words
1 about, blog, encoded, for, functions, go, golang, https, implements, in, information, manipulate, org, package, see, simple, testing, to
2 utf
4 strings