-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathFileAlignment.py
More file actions
104 lines (78 loc) · 3.3 KB
/
FileAlignment.py
File metadata and controls
104 lines (78 loc) · 3.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import pdb
import SentenceAlign
from deep_translator import GoogleTranslator
import os
from vncorenlp import VnCoreNLP
def findBilingualFile(file_name, list_file, extension):
    """Return every entry of ``list_file`` equal to ``"<file_name>.<extension>"``.

    Args:
        file_name: Base name of the file, without any extension.
        list_file: Iterable of candidate file names to search.
        extension: Suffix expected after the base name and a dot. It may be
            compound, e.g. ``"lo.txt"``.

    Returns:
        A list with the matching names in their original order; empty when
        nothing matches. The comparison is exact and case-sensitive.
    """
    # Build the expected name once instead of once per candidate.
    expected_name = "{}.{}".format(file_name, extension)
    return [candidate for candidate in list_file if candidate == expected_name]
def _read_nonempty_lines(path):
    """Return the stripped, non-blank lines of the UTF-8 text file at ``path``."""
    with open(path, "r", encoding="utf-8") as handle:
        stripped_lines = (line.strip() for line in handle)
        return [line for line in stripped_lines if line != ""]


def translateDocument(document_path, tgt_lang, annotator):
    """Align each Vietnamese file in a directory with its ``tgt_lang`` counterpart.

    For every directory entry whose name contains ``"vi.txt"``, the
    counterpart is looked up as ``"<base>.<tgt_lang>.txt"``, where ``<base>``
    is the part of the Vietnamese file name before its first dot. The
    counterpart's sentences are translated into Vietnamese and every aligned
    pair produced by ``AlignFileSentence`` is printed to standard output.

    Args:
        document_path: Directory holding the paired text files.
        tgt_lang: Language code of the non-Vietnamese files; used both as the
            translator's source language and in the counterpart file name.
        annotator: Annotator object forwarded unchanged to
            ``AlignFileSentence``.

    Vietnamese files without a counterpart are skipped, so a single unpaired
    file no longer aborts the whole run with an ``IndexError``.
    """
    # The translator turns tgt_lang text into Vietnamese so that both sides
    # can be compared in the same language.
    googleTranslator = GoogleTranslator(source=tgt_lang, target="vi")
    list_file = os.listdir(document_path)
    for viFile in list_file:
        # NOTE(review): this is a substring match, so a name such as
        # "a.vi.txt.bak" is also selected -- confirm that is intended.
        if "vi.txt" not in viFile:
            continue
        bilingualFile = findBilingualFile(
            viFile.split(".")[0], list_file, "{}.txt".format(tgt_lang)
        )
        if not bilingualFile:
            # No counterpart in this directory: nothing to align against.
            continue
        list_source = _read_nonempty_lines(os.path.join(document_path, viFile))
        list_target = _read_nonempty_lines(
            os.path.join(document_path, bilingualFile[0])
        )
        # Avoid sending an empty batch to the translator (deep_translator
        # appears to reject one -- TODO confirm); an empty target simply
        # yields no alignments.
        if list_target:
            translated_list = googleTranslator.translate_batch(list_target)
        else:
            translated_list = []
        for line in AlignFileSentence(list_source, list_target, translated_list, annotator):
            print(line)
def AlignFileSentence(list_origin, list_target, translated_list, annotator, min_score=0.2):
    """Greedily pair origin sentences with target sentences, best match first.

    On each pass every remaining origin sentence is scored against every
    remaining translated sentence with ``SentenceAlign.TF_IDF`` (applied to
    the outputs of ``SentenceAlign.preprocessSentence``). The single
    highest-scoring pair is yielded and removed, and the search restarts.
    Iteration stops once the best remaining score is not above ``min_score``.

    Args:
        list_origin: Sentences in the origin language. Mutated in place:
            aligned sentences are deleted.
        list_target: Untranslated target-language sentences, index-parallel
            to ``translated_list``. Mutated in place.
        translated_list: ``list_target`` translated into the origin language;
            this is what is actually compared. Mutated in place.
        annotator: Passed through to ``SentenceAlign.preprocessSentence``.
        min_score: Exclusive lower bound a pair's score must exceed to be
            emitted. Defaults to the previously hard-coded ``0.2``.

    Yields:
        One string per aligned pair, formatted as
        ``"<origin>\\n<target>\\n<score> \\n\\n"``.

    After the generator is exhausted, the three lists hold only the
    sentences that were never aligned.
    """
    # NOTE(review): every pass re-runs preprocessSentence on every remaining
    # sentence; if that helper is pure, its results could be cached per
    # sentence -- confirm before optimizing.
    while True:
        best_score = 0.0
        best_origin = 0
        best_translated = 0
        for origin_idx, origin_sentence in enumerate(list_origin):
            origin_bag = SentenceAlign.preprocessSentence(
                origin_sentence, annotator=annotator
            )
            # Iterate over translated_list itself. The previous bound was
            # len(list_origin), which overran a shorter translated_list and
            # ignored the tail of a longer one.
            for translated_idx, translated_sentence in enumerate(translated_list):
                translated_bag = SentenceAlign.preprocessSentence(
                    translated_sentence, annotator=annotator
                )
                pair_score = SentenceAlign.TF_IDF(origin_bag, translated_bag)
                # Strict comparison: on ties the earliest pair found wins.
                if best_score < pair_score:
                    best_origin = origin_idx
                    best_translated = translated_idx
                    best_score = pair_score
        if best_score <= min_score:
            break
        yield "{}\n{}\n{} \n\n".format(
            list_origin[best_origin], list_target[best_translated], best_score
        )
        # Remove the matched pair so neither sentence can be aligned again.
        # list_target and translated_list share the same index.
        del list_origin[best_origin]
        del translated_list[best_translated]
        del list_target[best_translated]
"""
Change the VnCoreNLP jar path below to match your local installation.
The second argument of translateDocument is the input language code.
"""
# Module-level annotator built from the VnCoreNLP jar at a path relative to
# the working directory. Only the "wseg" annotator is requested (presumably
# word segmentation -- confirm against the VnCoreNLP docs), and
# max_heap_size is given as the JVM-style flag '-Xmx500m'.
annotator = VnCoreNLP("./VnCoreNLP/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')
# NOTE: this runs whenever the module is executed or imported. It processes
# the hard-coded directory with "lo" as the non-Vietnamese language code
# (presumably Lao -- confirm) and prints the aligned pairs.
translateDocument("D:/NLP/Paracrawl/test", "lo", annotator)