实战演习(十一)——使用LSTM构建文本分类模型

笔者希望在平日的工作学习中，挖掘数据的价值，找寻数据的秘密，笔者认为，数据的价值不仅仅只体现在企业中，个人也可以体会到数据的魅力，用技术力量探索行为密码，让大数据助跑每一个人，欢迎直筒们关注我的公众号，大家一起讨论数据中的那些有趣的事情。

我的公众号为：livandata

由于工作中应用，需要对文本进行分类，主要是对客户的反馈信息进行分类，确定客户的反馈投诉是指的哪个部门的问题，然后自动的分发到对应的部门，由于工作数据存在一定的保密性，无法将其导入到csdn中，刚好又看到网上有一个类似的案例，就借用这个案例记录一下对应的算法模型：

这一模型主要应用的LSTM长短句模型，使用这一模型的优势是能够综合考虑句子中的语义部分，BiLSTM更是能够从反正两个方向训练语义，大大的提高了对应的训练精准度。

训练过程对应的代码如下：

#!/usr/bin/env python
# _*_ UTF-8 _*_
# 个人公众号：livandata
import os
import numpy
import re
import tarfile
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import *
from keras.layers import *
import matplotlib.pyplot as pyplot

def rm_tags(text):
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('', text)

def read_files(filetype):
    path = "./aclImdb/"
    file_list = []
    positive_path = path + filetype + "/pos/"
    for f in os.listdir(positive_path):
        file_list += [positive_path + f]
    negative_path = path + filetype + "/neg/"
    for f in os.listdir(negative_path):
        file_list += [negative_path + f]
    # all_labels = ([0,0,0,0,0,0,0,0,0,1] * 200 + [1,0,0,0,0,0,0,0,0,0] * 200 + [0,1,0,0,0,0,0,0,0,0] * 200 + [0,0,1,0,0,0,0,0,0,0] * 200 + [0,0,0,1,0,0,0,0,0,0] * 200 + [0,0,0,0,1,0,0,0,0,0] * 200 + [0,0,0,0,0,1,0,0,0,0] * 200 + [0,0,0,0,0,0,1,0,0,0] * 200 +[0,0,0,0,0,0,0,1,0,0] * 200 + [0,0,0,0,0,0,0,0,1,0] * 200)
    all_labels = np.zeros((2000, 10))
    all_labels[0:200, 9] = 1
    all_labels[200:400, 8] = 1
    all_labels[400:600, 7] = 1
    all_labels[600:800, 6] = 1
    all_labels[800:1000, 5] = 1
    all_labels[1000:1200, 4] = 1
    all_labels[1200:1400, 3] = 1
    all_labels[1400:1600, 2] = 1
    all_labels[1600:1800, 1] = 1
    all_labels[1800:2000, 0] = 1
    all_texts = []
    for fi in file_list:
        with open(fi, encoding='utf8') as file_input:
            filelines = file_input.readlines()
            all_texts += [rm_tags(filelines[0])]
    return all_labels, all_texts

# all_textsurl = "F:/python_workspace/offices/page_digging/movie_analysis/aclImdb_v1.tar.gz"
# # filepath = "aclImdb_v1.tar.gz"
# # tfile = tarfile.open(url)
# # result = tfile.extractall('.')
# 将数据按照训练和测试整理成统一对应的格式,返回的是内容和标签，
# 文本分类在数据预处理阶段相对比较简单：
y_train, x_train = read_files('train')
# print(y_train)
# <class 'numpy.ndarray'>
print(type(y_train))
print(type(x_train))

y_test, x_test = read_files('test')
# 按照文件夹分类：pos为正向情感1；neg为反向情感0：
token = Tokenizer(num_words=1000)
token.fit_on_texts(x_train)
token.fit_on_texts(x_test)
# 类似于one_hot的一种编码方式，主要是将语句序列化：
x_train_seq = token.texts_to_sequences(x_train)
x_test_seq = token.texts_to_sequences(x_test)
# 该函数是将序列转化为经过填充以后的一个长度相同的新序列：
x_train_v = sequence.pad_sequences(x_train_seq, maxlen=100)
x_test_v = sequence.pad_sequences(x_test_seq, maxlen=100)

model = Sequential()
# embedding类似于word2vec，主要是对数据降维
# input—dim：是输入的数据句子的大小。
model.add(Embedding(input_dim=1000, output_dim=32, input_length=100))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(256, activation='relu'))
model.add(Dense(10, activation='sigmoid'))
# 呈现运算过程：
model.summary()
# 使用优化模型进行优化的过程：
model.compile(loss ='categorical_crossentropy', optimizer='adam',metrics =['accuracy'])

train_his = model.fit(x_train_v,y_train,epochs=40,batch_size=50,validation_data=(x_test_v, y_test),verbose=2,shuffle=False)
scores = model.evaluate(x_test_v, y_test)

# plot history
# epochs=50：意味着图形中返回50个loss值，可以直接使用来做图。
pyplot.plot(train_his.history['loss'], label='train')
pyplot.plot(train_his.history['val_loss'], label='test')
# 显示图例：
pyplot.legend()
pyplot.show()
print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
print(scores)

model.save('BiLSTM_analysis.model')

训练完成后需要用到对应的模型，其中应用的过程如下：

#!/usr/bin/env python
# _*_ UTF-8 _*_
# 个人公众号：livandata
import tensorflow as tf
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import re
import os

def rm_tags(text):
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('', text)

def read_files(filetype):
    path = "./aclImdb/"
    file_list = []
    positive_path = path + filetype + "/pos/"
    for f in os.listdir(positive_path):
        file_list += [positive_path + f]
    negative_path = path + filetype + "/neg/"
    for f in os.listdir(negative_path):
        file_list += [negative_path + f]
    print('read', filetype, 'files:', len(file_list))
    print(file_list)
    all_texts = []
    for fi in file_list:
        with open(fi, encoding='utf8') as file_input:
            filelines = file_input.readlines()
            all_texts += [rm_tags(filelines[0])]
    return all_texts

def pre_data():
    x_train = read_files('train')
    # 按照文件夹分类：pos为正向情感1；neg为反向情感0：
    token = Tokenizer(num_words=1000)
    token.fit_on_texts(x_train)
    # 类似于one_hot的一种编码方式，主要是将语句序列化：
    x_train_seq = token.texts_to_sequences(x_train)
    # 该函数是将序列转化为经过填充以后的一个长度相同的新序列：
    x_train_v = sequence.pad_sequences(x_train_seq, maxlen=100)
    return x_train_v

if __name__ == '__main__':
    model = tf.keras.models.load_model('BiLSTM_analysis.model')
    x_train_p = pre_data()
    print(x_train_p)
    prediction = model.predict(x_train_p)
    print(prediction)

模型的训练需要不断的调整，我只是将其应用到对应的数据集时做了训练，生成对应的模型文件，如果后面需要应用其他数据，需要重新进行训练。

对应的代码为：https://download.csdn.net/download/livan1234/11238190

更多精彩内容