ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (1000209,) + inhomogeneous part.

import pandas as pd
import numpy as np
import re
import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import mean_absolute_error, ndcg_score
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Concatenate, Reshape, Multiply, Activation, GlobalAveragePooling1D, Bidirectional, LSTM, MultiHeadAttention
# Extract the movie title more safely (strip the trailing "(year)")
def extract_title_year(title):
    pattern = re.compile(r'^(.*?)\s*\((\d+)\)$')
    match = pattern.match(title)
    if match:
        return match.group(1).strip()
    else:
        return title.strip()
# Load data
def load_data():
    # User data
    users = pd.read_csv('/kaggle/input/tuijian/ml-1m/users.dat', sep='::', header=None,
                        names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], engine='python',
                        encoding='ISO-8859-1')
    users = users[['UserID', 'Gender', 'Age', 'Occupation']]
    users['UserID'] = users['UserID'].astype(int)
    gender_map = {'F': 0, 'M': 1}
    users['Gender'] = users['Gender'].map(gender_map)
    age_map = {var: ii for ii, var in enumerate(sorted(set(users['Age'])))}
    users['Age'] = users['Age'].map(age_map)

    # Movie data
    movies = pd.read_csv('/kaggle/input/tuijian/ml-1m/movies.dat', sep='::', header=None,
                         names=['MovieID', 'Title', 'Genres'], engine='python', encoding='ISO-8859-1')
    movies['MovieID'] = movies['MovieID'].astype(int)
    movies['TitleWithoutYear'] = movies['Title'].apply(extract_title_year)

    # Process genres
    genre_set = set()
    for var in movies['Genres'].str.split('|'):
        genre_set.update(var)
    genre_int_map = {var: ii for ii, var in enumerate(genre_set)}
    movies['GenresMultiHot'] = movies['Genres'].apply(
        lambda x: [genre_int_map[genre] for genre in x.split('|')])
    # Sum the per-genre one-hot rows into a single fixed-length multi-hot vector.
    # Without the .sum(axis=0), each movie gets a (num_genres, GENRE_COUNT) array whose
    # first dimension varies, and np.array(...) later raises the inhomogeneous-shape ValueError.
    movies['GenresMultiHot'] = movies['GenresMultiHot'].apply(
        lambda x: to_categorical(x, num_classes=len(genre_int_map)).sum(axis=0))

    # Process titles
    word_set = set()
    for var in movies['TitleWithoutYear'].str.split():
        word_set.update(var)
    word_int_map = {var: ii + 1 for ii, var in enumerate(word_set)}
    movies['TitleIndex'] = movies['TitleWithoutYear'].apply(
        lambda x: [word_int_map[word] for word in x.split() if word in word_int_map])
    movies['TitleIndex'] = movies['TitleIndex'].apply(
        lambda x: pad_sequences([x], maxlen=15, padding='post', value=0)[0])

    # Ratings data
    ratings = pd.read_csv('/kaggle/input/tuijian/ml-1m/ratings.dat', sep='::', header=None,
                          names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python',
                          encoding='ISO-8859-1')
    ratings['UserID'] = ratings['UserID'].astype(int)
    ratings['MovieID'] = ratings['MovieID'].astype(int)
    ratings = ratings[['UserID', 'MovieID', 'Rating']]

    # Merge data
    data = pd.merge(pd.merge(users, ratings), movies)
    features = data.drop(['Rating'], axis=1)
    targets = data[['Rating']]
    return features, targets, genre_int_map, word_int_map
# Load and preprocess the data
features, targets, genre_int_map, word_int_map = load_data()

# Hyperparameters
USER_ID_COUNT = 6040 + 1   # UserIDs are 1-based (max 6040), so the embedding needs one extra slot
MOVIE_ID_COUNT = 3952 + 1  # MovieIDs are 1-based (max 3952)
GENRE_COUNT = len(genre_int_map)
TITLE_WORD_COUNT = len(word_int_map) + 1
EMBED_DIM = 64
LSTM_UNIT_NUM = 64
DROPOUT_RATE = 0.3
# User feature network
def user_feature_network(user_id, user_gender, user_age, user_occupation):
    user_id_embed = Embedding(USER_ID_COUNT, EMBED_DIM)(user_id)
    user_gender_embed = Embedding(2, EMBED_DIM // 2)(user_gender)
    user_age_embed = Embedding(7, EMBED_DIM // 2)(user_age)
    user_occupation_embed = Embedding(21, EMBED_DIM // 2)(user_occupation)
    user_id_embed = Reshape((EMBED_DIM,))(user_id_embed)
    user_gender_embed = Reshape((EMBED_DIM // 2,))(user_gender_embed)
    user_age_embed = Reshape((EMBED_DIM // 2,))(user_age_embed)
    user_occupation_embed = Reshape((EMBED_DIM // 2,))(user_occupation_embed)
    user_combine = Concatenate()([user_id_embed, user_gender_embed, user_age_embed, user_occupation_embed])
    user_dense = Dense(128, activation='relu')(user_combine)
    return user_dense
# Movie feature network
def movie_feature_network(movie_id, movie_genres, movie_titles):
    movie_id_embed = Embedding(MOVIE_ID_COUNT, EMBED_DIM)(movie_id)
    # Flatten (batch, 1, EMBED_DIM) to (batch, EMBED_DIM) so it can be concatenated below
    movie_id_embed = Reshape((EMBED_DIM,))(movie_id_embed)
    movie_genres_embed = Dense(EMBED_DIM, activation='relu')(movie_genres)
    movie_title_embed = Embedding(TITLE_WORD_COUNT, EMBED_DIM)(movie_titles)
    transformer_output = MultiHeadAttention(num_heads=2, key_dim=EMBED_DIM)(movie_title_embed, movie_title_embed)
    transformer_output = GlobalAveragePooling1D()(transformer_output)
    movie_combine = Concatenate()([movie_id_embed, movie_genres_embed, transformer_output])
    movie_dense = Dense(128, activation='relu')(movie_combine)
    return movie_dense
# Attention interaction module
def attention_interaction(user_feat, movie_feat):
    attention_score = tf.keras.layers.Dot(axes=-1)([user_feat, movie_feat])
    attention_score = Reshape((1,))(attention_score)
    # Softmax over a single scalar is always 1, which makes the gate a no-op;
    # a sigmoid keeps the score in (0, 1) while still depending on the inputs.
    attention_score = Activation('sigmoid')(attention_score)
    return attention_score
# Build the model
def build_model():
    user_id = Input(shape=(1,), name='user_id')
    user_gender = Input(shape=(1,), name='user_gender')
    user_age = Input(shape=(1,), name='user_age')
    user_occupation = Input(shape=(1,), name='user_occupation')
    movie_id = Input(shape=(1,), name='movie_id')
    movie_genres = Input(shape=(GENRE_COUNT,), name='movie_genres')
    movie_titles = Input(shape=(15,), name='movie_titles')
    user_feat = user_feature_network(user_id, user_gender, user_age, user_occupation)
    movie_feat = movie_feature_network(movie_id, movie_genres, movie_titles)
    attention_score = attention_interaction(user_feat, movie_feat)
    interaction = Multiply()([user_feat, attention_score])
    combined = Concatenate()([interaction, movie_feat])
    # Ratings are on a 1-5 scale, so use a linear output; a sigmoid would cap predictions at 1
    output = Dense(1, name='predicted_rating')(combined)
    model = Model(
        inputs=[user_id, user_gender, user_age, user_occupation,
                movie_id, movie_genres, movie_titles],
        outputs=output
    )
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model
# Prepare data
user_id_arr = np.array(features['UserID']).reshape(-1, 1)
user_gender_arr = np.array(features['Gender']).reshape(-1, 1)
user_age_arr = np.array(features['Age']).reshape(-1, 1)
user_occupation_arr = np.array(features['Occupation']).reshape(-1, 1)
movie_id_arr = np.array(features['MovieID']).reshape(-1, 1)
movie_genres_arr = np.array(features['GenresMultiHot'].tolist())
movie_titles_arr = np.array(features['TitleIndex'].tolist())
ratings_arr = np.array(targets['Rating']).reshape(-1, 1)

# Split into training and test sets.
# train_test_split cannot treat a Python list of 7 arrays as a single "X"
# (its length 7 does not match the number of samples), so split a shared
# index array and slice every input with it instead.
X = [user_id_arr, user_gender_arr, user_age_arr, user_occupation_arr,
     movie_id_arr, movie_genres_arr, movie_titles_arr]
y = ratings_arr
train_idx, test_idx = train_test_split(np.arange(len(y)), test_size=0.2, random_state=42)
X_train = [arr[train_idx] for arr in X]
X_test = [arr[test_idx] for arr in X]
y_train, y_test = y[train_idx], y[test_idx]
# Build and train the model
model = build_model()
model.summary()
history = model.fit(X_train, y_train, epochs=10, batch_size=256, validation_split=0.2)

# Evaluate
loss, mae = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss:.4f}')
print(f'Test MAE: {mae:.4f}')

# Generate predictions
predictions = model.predict(X_test)
# Hit Rate @ K: fraction of users whose predicted top-K contains at least one item actually rated >= 4
def calculate_hit_rate(predictions, actuals, top_k=10):
    hit_count = 0
    user_dict = {}
    # Group (prediction, actual) pairs by user
    for i, (pred, act) in enumerate(zip(predictions, actuals)):
        user_id = X_test[0][i][0]
        if user_id not in user_dict:
            user_dict[user_id] = []
        user_dict[user_id].append((pred[0], act[0]))
    for user in user_dict:
        items = sorted(user_dict[user], key=lambda x: x[0], reverse=True)[:top_k]
        top_k_items = [item[1] for item in items]
        if any([act >= 4 for act in top_k_items]):
            hit_count += 1
    return hit_count / len(user_dict)
# NDCG @ K
def calculate_ndcg(predictions, actuals, top_k=10):
    user_dict = {}
    for i, (pred, act) in enumerate(zip(predictions, actuals)):
        user_id = X_test[0][i][0]
        if user_id not in user_dict:
            user_dict[user_id] = []
        user_dict[user_id].append((pred[0], act[0]))
    ndcg_scores = []
    for user in user_dict:
        preds, acts = zip(*user_dict[user])
        if len(preds) < 2:
            continue  # ndcg_score requires at least two items per user
        preds = np.array(preds).reshape(1, -1)
        acts = np.array(acts).reshape(1, -1)
        idxs = np.argsort(-preds[0])[:top_k]
        ndcg = ndcg_score(acts[:, idxs], preds[:, idxs], k=top_k)
        ndcg_scores.append(ndcg)
    return np.mean(ndcg_scores)
# Compute evaluation metrics
hit_rate = calculate_hit_rate(predictions, y_test, top_k=10)
ndcg_at_k = calculate_ndcg(predictions, y_test, top_k=10)
print(f'Test HR@10: {hit_rate:.4f}')
print(f'Test NDCG@10: {ndcg_at_k:.4f}')
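
For reference, the inhomogeneous-shape ValueError quoted at the top comes from the genre encoding: to_categorical on a list of genre indices returns one one-hot row per genre, so movies with different genre counts yield arrays with different first dimensions, and np.array(features['GenresMultiHot'].tolist()) then sees 1,000,209 ragged elements. Below is a minimal sketch of the failure and of the .sum(axis=0) multi-hot fix applied in load_data above; the genre indices are placeholders, and recent NumPy (>= 1.24) raises the error where older versions only warned.

import numpy as np
from tensorflow.keras.utils import to_categorical

# A one-genre movie and a three-genre movie produce differently shaped arrays
ragged = [to_categorical([0], num_classes=3),        # shape (1, 3)
          to_categorical([0, 1, 2], num_classes=3)]  # shape (3, 3)
# np.array(ragged)  # ValueError: ... inhomogeneous shape after 1 dimensions

# Summing over axis 0 collapses each movie to a fixed-length multi-hot vector
fixed = [to_categorical([0], num_classes=3).sum(axis=0),
         to_categorical([0, 1, 2], num_classes=3).sum(axis=0)]
print(np.array(fixed).shape)  # (2, 3)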