In [9]:
# Setup: install fastbook when /content exists (e.g. on Colab), then initialise the book environment
! [ -e /content ] && pip install -Uqq fastbook
import fastbook
fastbook.setup_book()
from fastbook import *

In [10]:
# Load the data
from fastai.collab import *
from fastai.tabular.all import *
path = untar_data(URLs.ML_100k)
# u.data is parsed as tab-separated with no header row, so the column names are given explicitly.
ratings = pd.read_csv(path/'u.data', delimiter='\t', header=None,
                      names=['user','movie','rating','timestamp'])
ratings.head()

Unnamed: 0,user,movie,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [11]:
# Keep only the first two '|'-separated columns of u.item (read as latin-1), named movie and title.
movies = pd.read_csv(path/'u.item',  delimiter='|', encoding='latin-1',
                     usecols=(0,1), names=('movie','title'), header=None)
movies.head()

Unnamed: 0,movie,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [12]:
# Add the title column to the ratings; with no `on=` given, merge joins on the columns both frames share.
ratings = ratings.merge(movies)
ratings.head()

Unnamed: 0,user,movie,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [13]:
# Build the DataLoaders from the merged frame, using the title column as the item and batches of 64.
dls = CollabDataLoaders.from_df(ratings, item_name='title', bs=64)
dls.show_batch()

Unnamed: 0,user,title,rating
0,374,"Jungle Book, The (1994)",5
1,503,Star Wars (1977),5
2,343,Emma (1996),4
3,263,"Terminator, The (1984)",5
4,409,2001: A Space Odyssey (1968),5
5,711,That Thing You Do! (1996),4
6,471,Matilda (1996),5
7,389,North by Northwest (1959),5
8,313,"Devil's Own, The (1997)",3
9,821,"Cry, the Beloved Country (1995)",5


In [14]:
# Number of classes the DataLoaders hold for the user and title columns.
n_users  = len(dls.classes['user'])
n_movies = len(dls.classes['title'])
n_factors = 5

# Random (standard-normal) factor matrices: one row of n_factors values per user / per title.
user_factors = torch.randn(n_users, n_factors)
movie_factors = torch.randn(n_movies, n_factors)

In [15]:
# Replace the index 3 with a one-hot encoded vector of length n_users, cast to float.
one_hot_3 = one_hot(3, n_users).float()

In [16]:
# Multiplying the transposed factor matrix by the one-hot vector picks out row 3 of user_factors.
user_factors.t() @ one_hot_3

tensor([ 0.5017,  0.5266, -1.4344, -1.1046, -0.0247])

In [17]:
# Index row 3 of user_factors directly (same values as the one-hot matrix product).
user_factors[3]

tensor([ 0.5017,  0.5266, -1.4344, -1.1046, -0.0247])

In [18]:
# Build the collaborative-filtering model
class DotProductBias(Module):
    """Dot-product collaborative-filtering model with user and movie bias terms.

    Each user and each movie gets an `n_factors`-wide embedding plus a
    one-element bias embedding. The prediction is the dot product of the two
    factor vectors plus both biases, passed through `sigmoid_range` with the
    bounds in `y_range`.
    """
    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        # Embeddings are created in a fixed order (user factors, user bias,
        # movie factors, movie bias) so random initialisation and parameter
        # ordering stay stable.
        self.user_factors = Embedding(n_users, n_factors)
        self.user_bias = Embedding(n_users, 1)
        self.movie_factors = Embedding(n_movies, n_factors)
        self.movie_bias = Embedding(n_movies, 1)
        self.y_range = y_range

    def forward(self, x):
        """Predict a rating for every row of `x`.

        Column 0 of `x` is used as the user index and column 1 as the movie index.
        """
        user_idx, movie_idx = x[:,0], x[:,1]
        # Element-wise product summed over the factor axis == per-row dot product.
        interaction = self.user_factors(user_idx) * self.movie_factors(movie_idx)
        score = interaction.sum(dim=1, keepdim=True)
        bias = self.user_bias(user_idx) + self.movie_bias(movie_idx)
        return sigmoid_range(score + bias, *self.y_range)

In [19]:
# Shape of a single batch of inputs
x,y = dls.one_batch()
x.shape

torch.Size([64, 2])

In [20]:
# Build the learner: a 50-factor DotProductBias model paired with the MSELossFlat loss.
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())

In [21]:
# Train for 5 epochs with the one-cycle schedule and a maximum learning rate of 5e-3.
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,0.964772,0.93565,00:16
1,0.835271,0.863038,00:13
2,0.602094,0.863331,00:13
3,0.406735,0.889156,00:13
4,0.282856,0.8962,00:12


In [22]:
# Introduce weight decay: first rebuild the learner with collab_learner (n_factors=50, y_range=(0, 5.5));
# the weight decay itself is supplied through `wd` when fitting.
learn = collab_learner(dls, n_factors=50, y_range=(0, 5.5))

In [23]:
# Retrain, now passing a weight decay of 0.1.
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.950799,0.928725,00:13
1,0.851575,0.874137,00:13
2,0.72029,0.834112,00:13
3,0.607391,0.81743,00:12
4,0.505506,0.818317,00:12


In [24]:
# Structure of the model
learn.model

EmbeddingDotBias(
  (u_weight): Embedding(944, 50)
  (i_weight): Embedding(1665, 50)
  (u_bias): Embedding(944, 1)
  (i_bias): Embedding(1665, 1)
)

In [31]:
# Collaborative filtering with a neural network
class CollabNN(Module):
    """Neural-network collaborative-filtering model.

    User and item embeddings are concatenated and fed through a two-layer
    fully connected network (`n_act` hidden units with a ReLU in between).
    The single output is passed through `sigmoid_range` with the bounds in
    `y_range`.

    `user_sz` and `item_sz` are unpacked into `Embedding(...)`, and their
    element at index 1 is used as the embedding width.
    """
    def __init__(self, user_sz, item_sz, y_range=(0,5.5), n_act=100):
        self.user_factors = Embedding(*user_sz)
        self.item_factors = Embedding(*item_sz)
        # The first linear layer consumes both embeddings placed side by side.
        concat_width = user_sz[1]+item_sz[1]
        hidden = nn.Linear(concat_width, n_act)
        head = nn.Linear(n_act, 1)
        self.layers = nn.Sequential(hidden, nn.ReLU(), head)
        self.y_range = y_range

    def forward(self, x):
        """Predict a rating for every row of `x` (column 0: user index, column 1: item index)."""
        user_vecs = self.user_factors(x[:,0])
        item_vecs = self.item_factors(x[:,1])
        joined = torch.cat((user_vecs, item_vecs), dim=1)
        return sigmoid_range(self.layers(joined), *self.y_range)

In [34]:
# get_emb_sz(dls) supplies the embedding sizes, unpacked here into CollabNN's user_sz and item_sz.
embs = get_emb_sz(dls)
model = CollabNN(*embs)

In [35]:
# Train the CollabNN model with the MSELossFlat loss: 5 epochs, max learning rate 5e-3, weight decay 0.01.
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.01)

epoch,train_loss,valid_loss,time
0,0.956142,0.956086,00:17
1,0.880769,0.905277,00:17
2,0.881123,0.87229,00:15
3,0.826901,0.863872,00:15
4,0.769603,0.867884,00:15


In [36]:
# Same approach through collab_learner with use_nn=True and layers=[100,50], trained with weight decay 0.1.
learn = collab_learner(dls, use_nn=True, y_range=(0, 5.5), layers=[100,50])
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.984944,0.98233,00:18
1,0.957174,0.913392,00:16
2,0.883421,0.881463,00:18
3,0.829791,0.853105,00:18
4,0.76939,0.857803,00:17
