当前位置：首页> 正文

将 word2vec 中的 bin 文件转换为 txt 文件

from gensim.models import word2vec
 
try:
    # gensim >= 1.0 provides the word2vec-format loader on KeyedVectors; the
    # Word2Vec classmethod of the same name is deprecated there and raises.
    from gensim.models import KeyedVectors as loader
except ImportError:
    # Older gensim releases only expose the loader on Word2Vec.
    loader = word2vec.Word2Vec

# Read the binary word2vec-format vectors, then write them back out in the
# plain-text word2vec format (binary=False).
model = loader.load_word2vec_format('/home/ubuntu/word2vec/PubMed-w2v.bin', binary=True)
model.save_word2vec_format('/home/ubuntu/word2vec/PubMed-w2v.txt', binary=False)

运行代码时会出现一条警告（UserWarning，并不是错误）：

word2vec中的bin文件转换为txt 文件

作为轻度强迫症的我，看到这个 UserWarning 极为不爽，于是尝试安装 Pattern，结果怎么装都不成功。上网搜索原因时才发现：Pattern 只支持 Python 2.x，而我用的是 Python 3.5.2，总不能因为一个 Warning 就退回 Python 2.x，于是我忍了。果然只是轻度强迫症。

第二种方法（与第一种大同小异，不过也记录一下）


from gensim.models import word2vec
 
try:
    # gensim >= 1.0 provides the word2vec-format loader on KeyedVectors; the
    # Word2Vec classmethod of the same name is deprecated there and raises.
    from gensim.models import KeyedVectors as loader
except ImportError:
    # Older gensim releases only expose the loader on Word2Vec.
    loader = word2vec.Word2Vec

model = loader.load_word2vec_format('Path/to/GoogleNews-vectors-negative300.bin', binary=True)
# model.save() writes gensim's own pickled format regardless of the file
# extension; save_word2vec_format(binary=False) is what produces readable text.
model.save_word2vec_format("file.txt", binary=False)

第三种方法（其实都一样啦）
import codecs
from gensim.models import Word2Vec
 
def main():
    """Export the GoogleNews binary model to its text counterpart.

    Both file names are fixed and resolved relative to the current working
    directory; the conversion itself is delegated to ``export_to_file``.
    """
    export_to_file(
        'GoogleNews-vectors-negative300.bin',
        'GoogleNews-vectors-negative300_test.txt',
    )
 
 
def export_to_file(path_to_model, output_file):
    """Convert a binary word2vec-format model into a tab-separated text file.

    Every vocabulary entry becomes one output line of the form
    ``<word><TAB><v1>,<v2>,...,<vn>``: the word, a tab character, then the
    vector components rendered with ``str()`` and joined by commas.

    Args:
        path_to_model: Path of the binary word2vec-format model to read.
        output_file: Path of the UTF-8 text file to write; an existing file
            is overwritten.
    """
    try:
        # gensim >= 1.0 provides the word2vec-format loader on KeyedVectors;
        # the Word2Vec classmethod of the same name is deprecated there and
        # raises.
        from gensim.models import KeyedVectors as loader
    except ImportError:
        # Older gensim releases only expose the loader on Word2Vec.
        loader = Word2Vec

    # Load before opening the output so a failed load does not leave behind
    # a truncated output file.
    model = loader.load_word2vec_format(path_to_model, binary=True)
    print('done loading Word2Vec')

    # gensim >= 4.0 renamed the vocabulary mapping from `vocab` to
    # `key_to_index`; fall back to the old name on earlier releases.
    vocab = getattr(model, 'key_to_index', None)
    if vocab is None:
        vocab = model.vocab

    # The context manager closes the file even if a write fails midway.
    with codecs.open(output_file, 'w', 'utf-8') as output:
        for mid in vocab:
            vector_str = ",".join(str(dimension) for dimension in model[mid])
            output.write(mid + "\t" + vector_str + "\n")
 
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
 main()