Load data

import pandas as pd
import numpy as np
# Import confusion_matrix and classification_report from the sklearn.metrics module
from sklearn.metrics import confusion_matrix, classification_report

# The dataset is a tab-separated file, so the delimiter is given explicitly.
df = pd.read_csv('./Restaurant_Reviews.tsv', sep='\t')
print(df)

Load model word2vec pretrained

import pandas as pd
from datetime import date, timedelta
import re
from nltk.tokenize import word_tokenize
from gensim import corpora, models
from gensim.models import KeyedVectors
from gensim.matutils import corpus2dense
import gensim
# Load the word vectors from Google's pretrained word2vec model (binary file).
GoogleModel = KeyedVectors.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin', binary=True)

Test model

# Sanity-check the loaded vectors with a single probe word.
word = 'apple'

# Indexing the model by a word yields that word's embedding vector.
vector = GoogleModel[word]
print(vector.shape)

# Nearest neighbours of the probe word as (word, similarity) pairs;
# topn=10 spells out the library default.
similar_words = GoogleModel.most_similar(positive=[word], topn=10)
print(similar_words)

[('apples', 0.720359742641449), ('pear', 0.6450697183609009), ('fruit', 0.6410146355628967), ('berry', 0.6302294731140137), ...]

Convert sang list token theo vocabulary

from keras_preprocessing.text import Tokenizer
from keras_preprocessing import sequence

# Build the vocabulary from the review texts, then encode every review as a
# list of integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Review'])
sequences = tokenizer.texts_to_sequences(df['Review'])

# Bring every sequence to the same length so they stack into one 2-D array.
maxlen = 100
X = sequence.pad_sequences(sequences, maxlen=maxlen)
print('X:\n', X[:10])

Tạo embedding matrix

import numpy as np

embedding_dim = 300
word_index = tokenizer.word_index
num_words = min(len(word_index) + 1, len(GoogleModel.index_to_key))

# Row i holds the pretrained vector of the word whose tokenizer index is i;
# rows for words missing from the word2vec vocabulary stay all-zero.
embedding_matrix = np.zeros((num_words, embedding_dim))
print('num_words:', num_words)

# key_to_index is a dict, so membership is a hash lookup. Testing against the
# index_to_key list would scan the whole pretrained vocabulary for every word.
known_words = GoogleModel.key_to_index
for word, i in word_index.items():
    if i >= num_words:
        continue
    if word in known_words:
        embedding_matrix[i] = GoogleModel[word]

Tạo model train

from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, LSTM, Bidirectional

model = Sequential()
# Frozen embedding layer initialised with the pretrained word2vec matrix.
model.add(Embedding(num_words, embedding_dim, input_length=maxlen, weights=[embedding_matrix], trainable=False))
# No input_shape here: the LSTM is not the first layer, so it takes its input
# shape, (maxlen, embedding_dim), from the embedding output.
model.add(Bidirectional(LSTM(64, return_sequences=True)))
# return_sequences=True keeps one output per timestep; flatten them for the
# final dense classifier.
model.add(Flatten())
# Single sigmoid unit: binary classification.
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()

Model: "sequential_26"
_________________________________________________________________
 Layer (type)                Output Shape              Param #
=================================================================
 embedding_26 (Embedding)    (None, 100, 300)          621600
 bidirectional_8 (Bidirectio  (None, 100, 128)         186880
 nal)
 flatten_14 (Flatten)        (None, 12800)             0
 dense_17 (Dense)            (None, 1)                 12801
=================================================================
Total params: 821,281
Trainable params: 199,681
Non-trainable params: 621,600
_________________________________________________________________

Tạo data test


from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, df['Liked'], test_size=0.3, random_state=101)

Train thôi.

# Train on the training split for 10 epochs with mini-batches of 32 samples.
model.fit(X_train, y_train, epochs=10, batch_size=32)

Test độ chính xác nồ

# Raw model outputs for the held-out samples.
predictions = model.predict(X_test)
print('predict:', predictions[:3])

# Round each output to the nearest class label before scoring.
predictions = predictions.round()
print('predictions:', predictions.ravel())

# Compare the predicted labels against the ground truth.
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

Done, code cứ thế mà run thôi, hy vọng sẽ giúp ích được cho mọi người. Thanks for reading.

Dùng word2vec + LSTM trong bài toán classify thực tế, version example code (ít lý thuyết) :)

Load data

Load model word2vec pretrained

Test model

Convert sang list token theo vocabulary

Tạo embedding matrix

Tạo model train

Tạo data test

Train thôi.

Test độ chính xác nồ

Done, code cứ thế mà run thôi, hy vọng sẽ giúp ích được cho mọi người. Thanks for reading.

Bình luận

Bài viết tương tự

Epoch, Batch size và Iterations

YOLOv2: Tốt hơn, nhanh hơn và mạnh mẽ hơn

Giới thiệu về Diffussion model (series 2)

Introduction Backpropagation ANN(Series 1)

Introduction backpropagation RNN and LSTM(Series 3)

Khám phá sức mạnh của cơ chế Self Attention trong Transformers