[PyTorch] Add The MovieLens Dataset #2030

Open · wants to merge 2 commits into base: master
95 changes: 92 additions & 3 deletions chapter_recommender-systems/movielens.md
@@ -18,10 +18,18 @@
import os
import pandas as pd
```

```{.python .input}
#@tab pytorch
from d2l import torch as d2l
import torch
import os
import pandas as pd
```

Then, we download the MovieLens 100k dataset and load the interactions as a `DataFrame`.

```{.python .input n=2}
#@tab all
#@save
d2l.DATA_HUB['ml-100k'] = (
    'https://files.grouplens.org/datasets/movielens/ml-100k.zip',
@@ -43,7 +51,7 @@
def read_data_ml100k():
Let's load the data and inspect the first five records manually. This is an effective way to learn the data structure and verify that it has been loaded properly.

```{.python .input n=3}
#@tab all
data, num_users, num_items = read_data_ml100k()
sparsity = 1 - len(data) / (num_users * num_items)
print(f'number of users: {num_users}, number of items: {num_items}')
@@ -56,7 +64,7 @@
We can see that each line consists of four columns, including "user id" 1-943, "item id" 1-1682, "rating" 1-5 and "timestamp".
We then plot the distribution of the different rating values. As expected, it appears roughly normally distributed, with most ratings concentrated at 3-4.

```{.python .input n=4}
#@tab all
d2l.plt.hist(data['rating'], bins=5, ec='black')
d2l.plt.xlabel('Rating')
d2l.plt.ylabel('Count')
@@ -95,6 +103,33 @@
def split_data_ml100k(data, num_users, num_items,
    return train_data, test_data
```

```{.python .input}
#@tab pytorch
#@save
def split_data_ml100k(data, num_users, num_items,
                      split_mode='random', test_ratio=0.1):
    """Split the dataset in random mode or seq-aware mode."""
    if split_mode == 'seq-aware':
        train_items, test_items, train_list = {}, {}, []
        for line in data.itertuples():
            u, i, rating, time = line[1], line[2], line[3], line[4]
            train_items.setdefault(u, []).append((u, i, rating, time))
            if u not in test_items or test_items[u][-1] < time:
                test_items[u] = (i, rating, time)
        for u in range(1, num_users + 1):
            train_list.extend(sorted(train_items[u], key=lambda k: k[3]))
        test_data = [(key, *value) for key, value in test_items.items()]
        train_data = [item for item in train_list if item not in test_data]
        train_data = pd.DataFrame(train_data)
        test_data = pd.DataFrame(test_data)
    else:
        mask = (torch.rand(len(data)) < 1 - test_ratio).tolist()
        neg_mask = [not x for x in mask]
        train_data, test_data = data[mask], data[neg_mask]
    return train_data, test_data
```
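The `seq-aware` branch implements a leave-one-out protocol: each user's most recent interaction (the one with the largest timestamp) is held out for testing, and everything earlier stays in training. A toy pandas sketch of the same idea — the frame and its column names here are illustrative, not the chapter's data:

```python
import pandas as pd

# Toy interaction log: two users, timestamps deliberately out of order.
data = pd.DataFrame({
    'user_id':   [1, 1, 1, 2, 2],
    'item_id':   [10, 11, 12, 20, 21],
    'rating':    [4, 5, 3, 2, 5],
    'timestamp': [100, 200, 300, 150, 50]})

# Hold out each user's latest interaction as the test example.
test = data.loc[data.groupby('user_id')['timestamp'].idxmax()]
train = data.drop(test.index)
```

Here user 1's latest item is 12 and user 2's is 20, so those two rows form the test set while the remaining three rows stay in training.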

Note that it is good practice to use a validation set in addition to a test set. However, we omit that for the sake of brevity. In this case, our test set can be regarded as our held-out validation set.
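One simple way to obtain such a validation set is a three-way random split. The function below is only an illustrative sketch — its name, the `valid_ratio` parameter, and the NumPy-based masking are not part of the chapter's code:

```python
import numpy as np
import pandas as pd

def train_valid_test_split(data, valid_ratio=0.1, test_ratio=0.1, seed=0):
    """Randomly assign each row of a DataFrame to train, validation, or test."""
    draw = np.random.default_rng(seed).random(len(data))
    test_mask = draw < test_ratio
    valid_mask = (draw >= test_ratio) & (draw < test_ratio + valid_ratio)
    train_mask = ~(test_mask | valid_mask)
    return data[train_mask], data[valid_mask], data[test_mask]

# Toy frame standing in for the MovieLens interactions.
data = pd.DataFrame({'user_id': np.arange(1000), 'rating': np.ones(1000)})
train, valid, test = train_valid_test_split(data)
```

The three parts partition the rows, so model selection can use `valid` while `test` stays untouched until the final evaluation.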

## Loading the data
@@ -120,6 +155,25 @@
def load_data_ml100k(data, num_users, num_items, feedback='explicit'):
    return users, items, scores, inter
```

```{.python .input}
#@tab pytorch
#@save
def load_data_ml100k(data, num_users, num_items, feedback='explicit'):
    users, items, scores = [], [], []
    inter = torch.zeros((num_items, num_users)) if feedback == 'explicit' else {}
    for line in data.itertuples():
        user_index, item_index = int(line[1] - 1), int(line[2] - 1)
        score = int(line[3]) if feedback == 'explicit' else 1
        users.append(user_index)
        items.append(item_index)
        scores.append(score)
        if feedback == 'implicit':
            inter.setdefault(user_index, []).append(item_index)
        else:
            inter[item_index, user_index] = score
    return users, items, scores, inter
```
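The positional indexing `line[1]`, `line[2]`, `line[3]` above relies on `DataFrame.itertuples` placing the row index at position 0 and the columns from position 1 onward, which is easy to check on a toy frame:

```python
import pandas as pd

df = pd.DataFrame({'user_id': [1, 2], 'item_id': [10, 20], 'rating': [5, 3]})
row = next(df.itertuples())
# Position 0 is the DataFrame index; columns follow in declaration order.
print(row[0], row[1], row[2], row[3])  # 0 1 10 5
```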

Afterwards, we put the above steps together; the resulting function will be used in the next section. The results are wrapped with `Dataset` and `DataLoader`. Note that the `last_batch` of the MXNet `DataLoader` for training data is set to the `rollover` mode (the remaining samples are rolled over to the next epoch), while the PyTorch version passes `drop_last=True` to discard the incomplete final batch; in both cases the order is shuffled.

```{.python .input n=7}
@@ -146,6 +200,41 @@
def split_and_load_ml100k(split_mode='seq-aware', feedback='explicit',
    return num_users, num_items, train_iter, test_iter
```

```{.python .input}
#@tab pytorch
#@save
def split_and_load_ml100k(split_mode='seq-aware', feedback='explicit',
                          test_ratio=0.1, batch_size=256):
    data, num_users, num_items = read_data_ml100k()
    train_data, test_data = split_data_ml100k(
        data, num_users, num_items, split_mode, test_ratio)
    train_u, train_i, train_r, _ = load_data_ml100k(
        train_data, num_users, num_items, feedback)
    test_u, test_i, test_r, _ = load_data_ml100k(
        test_data, num_users, num_items, feedback)

    class ML100KDataset(torch.utils.data.Dataset):
        def __init__(self, users, items, ratings):
            assert len(users) == len(items) == len(ratings)
            self.users = users
            self.items = items
            self.ratings = ratings

        def __getitem__(self, index):
            return (self.users[index], self.items[index],
                    self.ratings[index])

        def __len__(self):
            return len(self.users)

    train_set = ML100KDataset(train_u, train_i, train_r)
    test_set = ML100KDataset(test_u, test_i, test_r)
    train_iter = torch.utils.data.DataLoader(train_set, batch_size, shuffle=True,
                                             drop_last=True)
    test_iter = torch.utils.data.DataLoader(test_set, batch_size)
    return num_users, num_items, train_iter, test_iter
```
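PyTorch's `DataLoader` has no `rollover` mode; `drop_last=True` is the closest substitute and simply discards the final incomplete training batch rather than carrying its samples into the next epoch. A minimal illustration on a toy dataset (the `ToyDataset` class is purely for demonstration):

```python
import torch
from torch.utils.data import DataLoader, Dataset

class ToyDataset(Dataset):
    """Five integer samples, just enough to count batches."""
    def __len__(self):
        return 5

    def __getitem__(self, i):
        return i

# batch_size=2 over 5 samples: drop_last=True discards the odd sample out.
loader = DataLoader(ToyDataset(), batch_size=2, drop_last=True)
num_batches = len(loader)  # floor(5 / 2) = 2
```

With `drop_last=False` (the default) the same loader would yield a third, smaller batch of one sample.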

## Summary

* MovieLens datasets are widely used for recommendation research. They are publicly available and free to use.