# Open the tagged corpus and prime `line` with its first row; the counting
# loop further down keeps reading from `f` until readline() returns ''.
f = open('1998-01-2003版-带音.txt', encoding='utf-8')
line = f.readline()

# Unigram table: word -> occurrence count.
# The <BOS>/<EOS> markers are deliberately not pre-seeded here.
word_dict = {}

# Bigram table: "first second" (joined by one space) -> occurrence count.
word2_dict = {}

# Running total of counted words.
word_sum = 0

def addtodict(d, w):
    """Increment the counter stored under key *w* in mapping *d*.

    A key that is not present yet is created with the value 1.
    The mapping is modified in place; nothing is returned.
    """
    d[w] = d.get(w, 0) + 1

def isDivOfSentense(str):
    """Tell whether a token is a sentence separator such as ``'，/w'``.

    Only the last two characters are inspected: the token counts as a
    separator exactly when they are ``'/w'``.  The parameter keeps its
    original name (which shadows the builtin) so existing callers are
    unaffected.
    """
    return str[-2:] == '/w'

# True whenever the next ordinary word would open a new sentence.
begin = True

# `line` already holds the first row of the corpus, so keep the
# read-at-the-bottom loop shape instead of iterating over `f`.
while line:
    # Drop the leading date field; the remaining two-space separated
    # fields are the tagged tokens.
    data = line.split('  ')[1:]
    # Bigram state: previous word of the current sentence ('' = none yet).
    previous = ''
    for i in data:
        # Strip blanks AND line terminators.  When a line does not end with
        # the two-space separator, the newline stays glued to the last token
        # and would otherwise hide its '/w' tag.
        i = i.strip(' \t\r\n')
        if not i:
            # Empty field: the bare trailing newline, or padding produced by
            # extra spaces.  It is not a word and must not be counted.
            continue
        # w is the token with its part-of-speech tag removed.
        w = i.split('/')[0]
        # Punctuation closes the current sentence (<EOS>).
        if isDivOfSentense(i):
            # Ignore the degenerate <BOS><EOS> case: punctuation that has no
            # word before it in this sentence.
            if begin:
                continue

            if previous != '':
                # Count the bigram [previous, <EOS>].
                addtodict(word2_dict, previous + ' ' + '<EOS>')

            # <EOS> itself is not added to word_dict or word_sum.
            begin = True
            continue

        # First ordinary word of a sentence: its predecessor is <BOS>.
        # <BOS> itself is not added to word_dict or word_sum.
        if begin:
            begin = False
            previous = '<BOS>'

        word_sum += 1
        if previous != '':
            # Count the bigram [previous, w].
            addtodict(word2_dict, previous + ' ' + w)

        # Unigram count, through the same helper used for bigrams.
        addtodict(word_dict, w)
        previous = w

    # The end of a line always ends the sentence, whether or not the line
    # carried a trailing separator.  As before, no <EOS> bigram is recorded
    # for a sentence that is cut off by the line break.
    begin = True

    line = f.readline()
    
# The corpus has been read completely: release the input handle first so it
# is closed even if one of the writes below fails.
f.close()

# Most frequent entries first.
word_dict_sorted = sorted(word_dict.items(), key=lambda d: d[1], reverse=True)
word2_dict_sorted = sorted(word2_dict.items(), key=lambda d: d[1], reverse=True)

# dict_3.py: a Python module holding the raw totals and both count tables.
# `with` guarantees each output file is flushed and closed, including when
# a write raises.
with open('dict_3.py', 'w', encoding='utf-8') as out_py:
    out_py.write('#!/usr/bin/python3\n')
    out_py.write('word_sum = ' + str(word_sum) + '\n')

    out_py.write('word_dict = ')
    print(word_dict, file=out_py)

    out_py.write('word2_dict = ')
    print(word2_dict, file=out_py)

# NOTE(review): both CSV files are opened without an explicit encoding, so
# they use the platform default while dict_3.py is UTF-8 -- confirm this is
# intended before changing it, since consumers may rely on the current bytes.

# 3_word1.csv: word, count, relative frequency (count / word_sum).
# word_sum cannot be zero here when the table has at least one row.
with open('3_word1.csv', 'w') as out:
    for ch in word_dict_sorted:
        out.write(ch[0] + ',' + str(ch[1]) + ',' + str(float(ch[1]) / word_sum) + '\n')

# 3_word2.csv: "first second" bigram key, count.
with open('3_word2.csv', 'w') as out:
    for ch in word2_dict_sorted:
        out.write(ch[0] + ',' + str(ch[1]) + '\n')

# Echo the unigram table to stdout.
print(word_dict)