# -*- coding: utf-8 -*-
import MeCab
import sys
import operator
import pyexcel as pe
# Read the whole corpus into memory; the context manager guarantees the file
# handle is closed once the text has been read.
with open('/home/sean/Bureau/etudies programa/pythonjp/mishima', 'r') as _corpus_file:
    test = _corpus_file.read()

# The command-line arguments are joined into the option string given to MeCab.
t = MeCab.Tagger(" ".join(sys.argv))

# List of sentences: the corpus split on the Japanese full stop.
sentences = test.split('。')

# The result of this call is discarded.
# NOTE(review): presumably kept as a warm-up call before parseToNode --
# confirm before removing.
t.parse(test)

# Maps each term to [count, sentence, sentence, ...]; filled in by queisso().
dicio = {}

# First node of the parsed corpus; the driver loop follows .next from here.
m = t.parseToNode(test)

# Sentence counter: index into `sentences` of the sentence being processed.
pyjp_current = 0
# Values of the second feature field for which a token is never counted:
# full stop, comma, binding particle, case-marking particle, conjunctive
# particle, parallel marker, closing bracket and opening bracket.  The
# original author also excluded '連体化' and '*' and noted them as unclear
# (seen on words such as 同じ / 大きな).
_SKIPPED_POS = frozenset((
    '句点', '読点', '係助詞', '格助詞', '連体化', '*',
    '接続助詞', '並立助詞', '括弧閉', '括弧開',
))


def queisso(meca, dados):
    """Tally one MeCab node into ``dados`` under its dictionary form.

    The node's ``feature`` string is split on commas.  Per the original
    author's notes, field 1 is the part-of-speech subcategory and field 6 is
    the dictionary form of the word.

    Parameters:
        meca: object with a comma-separated ``feature`` string attribute.
        dados: dict mutated in place; each value is a list whose first item
            is the occurrence count, followed by one cleaned-up sentence per
            occurrence.

    Returns:
        1 when the token is skipped without any other effect, otherwise
        None (the token was counted, or it was a full stop and the
        module-level sentence counter ``pyjp_current`` was advanced).

    Reads the module-level ``sentences`` list and reads/writes the
    module-level ``pyjp_current`` index.
    """
    global pyjp_current

    # Split the word's characteristics into a list.
    carac = meca.feature.split(',')
    categoria = carac[1]

    if categoria in _SKIPPED_POS:
        if categoria != '句点':
            return 1
        # A full stop closes the current sentence: move on to the next one.
        pyjp_current += 1
        return None

    # Read the dictionary form only for tokens that are actually counted, so
    # a short feature list on a skipped token cannot raise IndexError.
    termo = carac[6]

    # The counter can run past the end of `sentences` if more full-stop tokens
    # are seen than the '。' split produced; fall back to an empty context
    # instead of crashing.
    if pyjp_current < len(sentences):
        frase = sentences[pyjp_current].replace('\n', '').replace('\u3000', '')
    else:
        frase = ''

    if termo not in dados:
        dados[termo] = [1, frase]
    else:
        dados[termo][0] += 1
        dados[termo].append(frase)
    return None
def arrumaesalva(dic, destino="/home/sean/Bureau/etudies programa/pythonjp/mishima.csv"):
    """Sort the tally in descending order and save it as a spreadsheet file.

    Parameters:
        dic: dict whose items are sorted by value, largest first.
        destino: path handed to ``Sheet.save_as``; defaults to the location
            this script has always written to.

    The sort key is the whole dict value, so list values are compared
    element by element.
    NOTE(review): each row handed to ``pe.Sheet`` is a ``(key, value)`` pair,
    so a list value lands in a single cell -- confirm this is the intended
    layout.
    """
    tab = sorted(dic.items(), key=operator.itemgetter(1), reverse=True)
    # Debug output kept from the original; guarded so that an empty tally
    # no longer raises IndexError on tab[0].
    if tab:
        print(type(tab[0]))
    sheet = pe.Sheet(tab)
    sheet.save_as(destino)
# Follow the chain of parsed nodes, tallying each one into dicio, until the
# chain runs out.
while m:
    queisso(m, dicio)
    m = m.next

# Show the finished tally, then sort it and write it to disk.
print(dicio)
arrumaesalva(dicio)