{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "V100" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "a1qaQb1QYtP4", "outputId": "6a9074a2-7212-49e8-e050-076360ddd674" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/719.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━\u001b[0m \u001b[32m491.5/719.8 kB\u001b[0m \u001b[31m14.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m719.8/719.8 kB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m510.5/510.5 kB\u001b[0m \u001b[31m22.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m15.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m17.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m19.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m32.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hMounted at /content/gdrive\n" ] } ], "source": [ "#初始化\n", "! 
[ -e /content ] && pip install -Uqq fastbook\n", "import fastbook\n", "fastbook.setup_book()" ] }, { "cell_type": "code", "source": [ "from fastbook import *\n", "from IPython.display import display,HTML" ], "metadata": { "id": "9_RsicpHFHJg" }, "execution_count": 2, "outputs": [] }, { "cell_type": "code", "source": [ "# 加载IMDB数据集\n", "from fastai.text.all import *\n", "path = untar_data(URLs.IMDB)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 37 }, "id": "fLBe0ch9FJG0", "outputId": "12cd045f-6575-480e-f20d-d8140f23ea79" }, "execution_count": 3, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", "\n" ] }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", "
\n", " \n", " 100.00% [144441344/144440600 00:06<00:00]\n", "
\n", " " ] }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "files = get_text_files(path, folders = ['train', 'test', 'unsup'])" ], "metadata": { "id": "crDZjVVyF1z-" }, "execution_count": 4, "outputs": [] }, { "cell_type": "code", "source": [ "txt = files[0].open().read(); txt[:75]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "id": "XA-vpKWPGGDU", "outputId": "ede543e9-83eb-43c2-e375-98f4a4cd9ac0" }, "execution_count": 5, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'This Movie is complete crap! Avoid this waste of celluloid at all costs, it'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 5 } ] }, { "cell_type": "code", "source": [ "# spacy\n", "spacy = WordTokenizer()\n", "toks = first(spacy([txt]))\n", "print(coll_repr(toks, 30))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8RdqEGXtGIyn", "outputId": "1dd8251c-82ba-4227-a6d6-ad5a48a94fe6" }, "execution_count": 6, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "(#141) ['This','Movie','is','complete','crap','!','Avoid','this','waste','of','celluloid','at','all','costs',',','it','is','rambling','and','incoherent','.','I','pride','myself','on','plumbing','the','depths','of','70'...]\n" ] } ] }, { "cell_type": "code", "source": [ "first(spacy(['The U.S. 
dollar $1 is $1.00.']))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Vp1rjNJDGV6Y", "outputId": "b4c5a124-95b8-4fae-a6bf-fabe85b908be" }, "execution_count": 7, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(#9) ['The','U.S.','dollar','$','1','is','$','1.00','.']" ] }, "metadata": {}, "execution_count": 7 } ] }, { "cell_type": "code", "source": [ "# xxbos :: marks the beginning of a text (here, a review)\n", "# xxmaj :: marks that the next word starts with a capital letter (because everything has been lowercased)\n", "# xxunk :: marks that the word is unknown\n", "tkn = Tokenizer(spacy)\n", "print(coll_repr(tkn(txt), 31))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "VmTpr2tqGxeK", "outputId": "4086dd20-0da3-48db-d744-a3d0ee7b6251" }, "execution_count": 8, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "(#151) ['xxbos','xxmaj','this','xxmaj','movie','is','complete','crap','!','xxmaj','avoid','this','waste','of','celluloid','at','all','costs',',','it','is','rambling','and','incoherent','.','i','pride','myself','on','plumbing','the'...]\n" ] } ] }, { "cell_type": "code", "source": [ "defaults.text_proc_rules" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "kVULyjEOG9Jo", "outputId": "1d32355e-4276-4607-9c17-480ca1c60a3c" }, "execution_count": 9, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "[,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ,\n", " ]" ] }, "metadata": {}, "execution_count": 9 } ] }, { "cell_type": "code", "source": [ "# Subword tokenization\n", "txts = L(o.open().read() for o in files[:2000])" ], "metadata": { "id": "vdVeOoSzIwn4" }, "execution_count": 10, "outputs": [] }, { "cell_type": "code", "source": [ "def subword(sz, text=None, corpus=None, n_toks=40):\n", "    \"\"\"Fit a SubwordTokenizer of vocab size `sz` on `corpus` and return the first `n_toks` subword tokens of `text`, joined by spaces.\n", "\n", "    `text` defaults to the notebook-level sample review `txt` and `corpus` to the notebook-level `txts`,\n", "    so the one-argument call `subword(sz)` behaves exactly as before.\n", "    \"\"\"\n", "    text = txt if text is None else text\n", "    corpus = txts if corpus is None else corpus\n", "    sp = SubwordTokenizer(vocab_sz=sz)\n", "    sp.setup(corpus)\n", "    return ' '.join(first(sp([text]))[:n_toks])" ], "metadata": { "id": "RFpl_csBLMiQ" }, "execution_count": 11, "outputs": [] }, { "cell_type": "code", "source": [ "subword(1000)" ], "metadata": { "colab": { "base_uri": 
"https://localhost:8080/", "height": 34 }, "id": "R0zNY7fyLPPn", "outputId": "0ce19264-b1cb-4f32-d386-a16df3dbf699" }, "execution_count": 12, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", "\n" ] }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [] }, "metadata": {} }, { "output_type": "execute_result", "data": { "text/plain": [ "'▁This ▁M o v ie ▁is ▁complete ▁crap ! ▁A void ▁this ▁waste ▁of ▁c ell ul o id ▁at ▁all ▁co s t s , ▁it ▁is ▁ ra mb l ing ▁and ▁in co h er ent .'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 12 } ] }, { "cell_type": "code", "source": [ "subword(200)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "id": "7guVmVMZLRFi", "outputId": "c3ef4298-a0bb-4d8e-aad6-0a5895f25eea" }, "execution_count": 13, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", "\n" ] }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [] }, "metadata": {} }, { "output_type": "execute_result", "data": { "text/plain": [ "'▁ T h i s ▁ M o v i e ▁is ▁co m p le t e ▁c ra p ! 
▁A v o i d ▁this ▁was t e ▁of ▁c e l l u l o i'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 13 } ] }, { "cell_type": "code", "source": [ "subword(10000)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 52 }, "id": "tTO9uVrfVmjN", "outputId": "81f79a11-bd17-45e1-f7a9-bfd5565931bf" }, "execution_count": 14, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", "\n" ] }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [] }, "metadata": {} }, { "output_type": "execute_result", "data": { "text/plain": [ "\"▁This ▁Movie ▁is ▁complete ▁crap ! ▁Avoid ▁this ▁waste ▁of ▁celluloid ▁at ▁all ▁costs , ▁it ▁is ▁rambling ▁and ▁incoherent . ▁I ▁pride ▁myself ▁on ▁plumb ing ▁the ▁depth s ▁of ▁70 ' s ▁sleaze ▁cinema ▁from ▁everything ▁from ▁Sa\"" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 14 } ] }, { "cell_type": "code", "source": [ "# Numericalization with fastai\n", "# Tokenize once and reuse the result for display (it is also consumed by later cells).\n", "toks = tkn(txt)\n", "print(coll_repr(toks, 31))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Z4VM59aIVzC_", "outputId": "37fda435-471f-4b1f-b0ba-6c3258548b6d" }, "execution_count": 15, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "(#151) ['xxbos','xxmaj','this','xxmaj','movie','is','complete','crap','!','xxmaj','avoid','this','waste','of','celluloid','at','all','costs',',','it','is','rambling','and','incoherent','.','i','pride','myself','on','plumbing','the'...]\n" ] } ] }, { "cell_type": "code", "source": [ "toks200 = txts[:200].map(tkn)\n", "toks200[0]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ReuUYKv0Y9fz", "outputId": "30a7ad78-89b8-4422-9fe9-6b0a1ea420a0" }, "execution_count": 16, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(#151) 
['xxbos','xxmaj','this','xxmaj','movie','is','complete','crap','!','xxmaj'...]" ] }, "metadata": {}, "execution_count": 16 } ] }, { "cell_type": "code", "source": [ "num = Numericalize()\n", "num.setup(toks200)\n", "coll_repr(num.vocab,20)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "id": "VPr_sBFbZJka", "outputId": "eb200b75-2a91-46b5-b330-581229282eb1" }, "execution_count": 17, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "\"(#1960) ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj','the',',','.','a','and','to','of','i','it','is','in'...]\"" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 17 } ] }, { "cell_type": "code", "source": [ "nums = num(toks)[:20]; nums" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YqaslX-DZMYN", "outputId": "f2c6a4e2-5a11-4eae-98d5-0a6b74d6cbb2" }, "execution_count": 18, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "TensorText([ 2, 8, 20, 8, 27, 18, 313, 240, 44, 8, 616, 20, 196, 15, 0, 48, 47, 843, 10, 17])" ] }, "metadata": {}, "execution_count": 18 } ] }, { "cell_type": "code", "source": [ "' '.join(num.vocab[o] for o in nums)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "id": "oOVjd5KDZy-b", "outputId": "5311609d-c8d4-4279-822b-53f0d61ee7df" }, "execution_count": 19, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'xxbos xxmaj this xxmaj movie is complete crap ! xxmaj avoid this waste of xxunk at all costs , it'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 19 } ] }, { "cell_type": "code", "source": [ "# 将我们的文本分批用于语言模型\n", "stream = \"In this chapter, we will go back over the example of classifying movie reviews we studied in chapter 1 and dig deeper under the surface. 
First we will look at the processing steps necessary to convert text into numbers and how to customize it. By doing this, we'll have another example of the PreProcessor used in the data block API.\\nThen we will study how we build a language model and train it for a while.\"\n", "tokens = tkn(stream)\n", "bs,seq_len = 6,15\n", "d_tokens = np.array([tokens[i*seq_len:(i+1)*seq_len] for i in range(bs)])\n", "df = pd.DataFrame(d_tokens)\n", "display(HTML(df.to_html(index=False,header=None)))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 205 }, "id": "8nHLGIplZ2Sw", "outputId": "be36437d-820f-403d-f1d6-409496b25fc8" }, "execution_count": 20, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
xxbosxxmajinthischapter,wewillgobackovertheexampleofclassifying
moviereviewswestudiedinchapter1anddigdeeperunderthesurface.xxmaj
firstwewilllookattheprocessingstepsnecessarytoconverttextintonumbersand
howtocustomizeit.xxmajbydoingthis,we'llhaveanotherexample
ofthepreprocessorusedinthedatablockxxupapi.\\nxxmajthenwe
willstudyhowwebuildalanguagemodelandtrainitforawhile.
" ] }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "bs,seq_len = 6,5\n", "d_tokens = np.array([tokens[i*15:i*15+seq_len] for i in range(bs)])\n", "df = pd.DataFrame(d_tokens)\n", "display(HTML(df.to_html(index=False,header=None)))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 205 }, "id": "yciR9zWraC5d", "outputId": "793f34b2-ffdb-438b-d051-638a200b75af" }, "execution_count": 21, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
xxbosxxmajinthischapter
moviereviewswestudiedin
firstwewilllookat
howtocustomizeit.
ofthepreprocessorusedin
willstudyhowwebuild
" ] }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "bs,seq_len = 6,5\n", "d_tokens = np.array([tokens[i*15+seq_len:i*15+2*seq_len] for i in range(bs)])\n", "df = pd.DataFrame(d_tokens)\n", "display(HTML(df.to_html(index=False,header=None)))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 205 }, "id": "RURzn-OAf4cR", "outputId": "72837374-20dc-4025-c07c-be267a9b8482" }, "execution_count": 22, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
,wewillgoback
chapter1anddigdeeper
theprocessingstepsnecessaryto
xxmajbydoingthis,
thedatablockxxupapi
alanguagemodelandtrain
" ] }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "bs,seq_len = 6,5\n", "d_tokens = np.array([tokens[i*15+10:i*15+15] for i in range(bs)])\n", "df = pd.DataFrame(d_tokens)\n", "display(HTML(df.to_html(index=False,header=None)))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 205 }, "id": "DurIpeEtgJFF", "outputId": "6a674224-e337-447a-af24-f17162fcc93d" }, "execution_count": 23, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
overtheexampleofclassifying
underthesurface.xxmaj
converttextintonumbersand
we'llhaveanotherexample
.\\nxxmajthenwe
itforawhile.
" ] }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "nums200 = toks200.map(num)" ], "metadata": { "id": "A1ZvH5jKgSD2" }, "execution_count": 24, "outputs": [] }, { "cell_type": "code", "source": [ "dl = LMDataLoader(nums200)" ], "metadata": { "id": "0pQZS62DgolS" }, "execution_count": 25, "outputs": [] }, { "cell_type": "code", "source": [ "x,y = first(dl)\n", "x.shape,y.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "BrSoG8B3gqS8", "outputId": "a1afb517-1edb-414a-ad4f-75c2e30dcd17" }, "execution_count": 26, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(torch.Size([64, 72]), torch.Size([64, 72]))" ] }, "metadata": {}, "execution_count": 26 } ] }, { "cell_type": "code", "source": [ "' '.join(num.vocab[o] for o in x[0][:20])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "id": "u0xc_UTngs1G", "outputId": "936bee63-6a98-46a3-83d0-09c5bff56079" }, "execution_count": 27, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'xxbos xxmaj this xxmaj movie is complete crap ! xxmaj avoid this waste of xxunk at all costs , it'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 27 } ] }, { "cell_type": "code", "source": [ "' '.join(num.vocab[o] for o in y[0][:20])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "id": "RGvDk6Kugutz", "outputId": "ae082e9f-a644-4f4a-9ed6-e9acc59832cc" }, "execution_count": 28, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'xxmaj this xxmaj movie is complete crap ! 
xxmaj avoid this waste of xxunk at all costs , it is'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 28 } ] }, { "cell_type": "code", "source": [ "# Text generator\n", "# Use TextBlock to build the language-model DataLoaders\n", "get_imdb = partial(get_text_files, folders=['train', 'test', 'unsup'])\n", "\n", "# Fixed seed: keeps the 10% validation split (and the metrics computed on it) the same across Restart & Run All.\n", "dls_lm = DataBlock(\n", "    blocks=TextBlock.from_folder(path, is_lm=True),\n", "    get_items=get_imdb, splitter=RandomSplitter(0.1, seed=42)\n", ").dataloaders(path, path=path, bs=128, seq_len=80)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 72 }, "id": "KvMdJU0EgyQK", "outputId": "c29b7657-29c3-4610-b150-5ecb9f60c075" }, "execution_count": 29, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/spacy/util.py:1740: UserWarning: [W111] Jupyter notebook detected: if using `prefer_gpu()` or `require_gpu()`, include it in the same cell right before `spacy.load()` to ensure that the model is loaded on the correct device. More information: http://spacy.io/usage/v3#jupyter-notebook-gpu\n", " warnings.warn(Warnings.W111)\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", "\n" ] }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [] }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "dls_lm.show_batch(max_n=2)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 216 }, "id": "4y-hBLUTs3Pd", "outputId": "0da7db5a-54ac-4abf-c89f-6459f3d705cc" }, "execution_count": 30, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
texttext_
0xxbos i could n't believe the eye candy from start to finish . xxmaj being a fan of movies directed by music video masterminds . i am happy to report that the photography in this motion picture is a splendor for the eye to behold . xxmaj there are so many rich , full images that are put before me , that each and every time that i see this movie , i find something new that i had noti could n't believe the eye candy from start to finish . xxmaj being a fan of movies directed by music video masterminds . i am happy to report that the photography in this motion picture is a splendor for the eye to behold . xxmaj there are so many rich , full images that are put before me , that each and every time that i see this movie , i find something new that i had not seen
1/ premise , competently directed but completely let down by a real clunker of a script . xxmaj it 's nearly half an hour before anything remotely happens , which would n't be so bad if we were getting to know the characters , some back story etc , but nothing . xxmaj each character is a complete no entity , bland , two dimensional , each spurting flat dialogue which you can almost imagine them reading from the pagepremise , competently directed but completely let down by a real clunker of a script . xxmaj it 's nearly half an hour before anything remotely happens , which would n't be so bad if we were getting to know the characters , some back story etc , but nothing . xxmaj each character is a complete no entity , bland , two dimensional , each spurting flat dialogue which you can almost imagine them reading from the page .
" ] }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "learn = language_model_learner(\n", " dls_lm, AWD_LSTM, drop_mult=0.3,\n", " metrics=[accuracy, Perplexity()]).to_fp16()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 37 }, "id": "UypS54hXt5IP", "outputId": "6345caa3-edc7-4a62-8fdc-0ce114f6a890" }, "execution_count": 31, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", "\n" ] }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", "
\n", " \n", " 100.00% [105070592/105067061 00:03<00:00]\n", "
\n", " " ] }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "learn.fit_one_cycle(1, 2e-2)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 81 }, "id": "3xMKwSdnucG8", "outputId": "d0524cc9-f611-4889-ee1c-1b4f76f72888" }, "execution_count": 32, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", "\n" ] }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
epochtrain_lossvalid_lossaccuracyperplexitytime
04.0157013.8981740.30099349.31230213:29
" ] }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "learn.save('1epoch')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "sCpy-QKavO_b", "outputId": "7371293b-1c6c-4895-c80a-7c1796c6f270" }, "execution_count": 33, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Path('/root/.fastai/data/imdb/models/1epoch.pth')" ] }, "metadata": {}, "execution_count": 33 } ] }, { "cell_type": "code", "source": [ "learn = learn.load('1epoch')" ], "metadata": { "id": "5YoyQ73U3Vva" }, "execution_count": 34, "outputs": [] }, { "cell_type": "code", "source": [ "learn.unfreeze()\n", "learn.fit_one_cycle(10, 2e-3)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 505 }, "id": "Afi8MDqE3XQ0", "outputId": "19876937-5356-4eef-d385-85d06c42475c" }, "execution_count": 35, "outputs": [ { "data": { "text/html": [ "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "
\n", " \n", " 20.00% [2/10 28:15<1:53:01]\n", "
\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
epochtrain_lossvalid_lossaccuracyperplexitytime
03.7601953.7575040.31710342.84137014:23
13.6959393.7024580.32358840.54684113:52

\n", "\n", "

\n", " \n", " 35.64% [938/2632 04:21<07:52 3.6350]\n", "
\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
epochtrain_lossvalid_lossaccuracyperplexitytime
03.7601953.7575040.31710342.84137014:23
13.6959393.7024580.32358840.54684113:52
23.6419483.6495890.32939138.45886613:29
33.5512993.6148840.33325037.14702213:47
43.4998523.5933770.33611536.35664713:24
53.4454903.5785170.33804035.82038913:58
63.3493583.5704740.33956935.53342113:32
73.2879133.5681420.34069435.45067213:23
83.2488523.5702730.34090135.52629513:51
93.2213703.5746160.34071935.68091613:26
" ] }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "learn.save_encoder('finetuned')" ], "metadata": { "id": "hBQ_fLw03YgQ" }, "execution_count": 36, "outputs": [] }, { "cell_type": "code", "source": [ "# 文本生成\n", "TEXT = \"I liked this movie because\"\n", "N_WORDS = 40\n", "N_SENTENCES = 2\n", "preds = [learn.predict(TEXT, N_WORDS, temperature=0.75)\n", " for _ in range(N_SENTENCES)]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 17 }, "id": "d50pIj7o3cZi", "outputId": "9706d5d0-feb1-4405-9521-ecd35aa56a1b" }, "execution_count": 37, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", "\n" ] }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [] }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", "\n" ] }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [] }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "print(\"\\n\".join(preds))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wvltdv3I3lLZ", "outputId": "217770d6-123b-4f52-e01b-74b9bc0aa625" }, "execution_count": 38, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "i liked this movie because it was a \" chick flick \" which i liked more than others , so i gave it a chance , and even though i am a big fan of Olivia de Havilland , i think that the\n", "i liked this movie because it actually made me smile and once i learned something about the story it really got a lot of fun . i think this movie is real a lot more than the sound track . 
i have to say that\n" ] } ] }, { "cell_type": "code", "source": [ "# 文本分类器\n", "dls_clas = DataBlock(\n", " blocks=(TextBlock.from_folder(path, vocab=dls_lm.vocab),CategoryBlock),\n", " get_y = parent_label,\n", " get_items=partial(get_text_files, folders=['train', 'test']),\n", " splitter=GrandparentSplitter(valid_name='test')\n", ").dataloaders(path, path=path, bs=128, seq_len=72)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PFNkXFxx3mcu", "outputId": "b37286ce-8e61-43dc-8a2e-c9bf3fdcc157" }, "execution_count": 39, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/spacy/util.py:1740: UserWarning: [W111] Jupyter notebook detected: if using `prefer_gpu()` or `require_gpu()`, include it in the same cell right before `spacy.load()` to ensure that the model is loaded on the correct device. More information: http://spacy.io/usage/v3#jupyter-notebook-gpu\n", " warnings.warn(Warnings.W111)\n" ] } ] }, { "cell_type": "code", "source": [ "dls_clas.show_batch(max_n=3)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 300 }, "id": "IkvCrCGx3rk7", "outputId": "e431749b-c69d-420a-ec73-859343e8fd11" }, "execution_count": 40, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textcategory
0xxbos xxmaj match 1 : xxmaj tag xxmaj team xxmaj table xxmaj match xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley vs xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley started things off with a xxmaj tag xxmaj team xxmaj table xxmaj match against xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit . xxmaj according to the rules of the match , both opponents have to go through tables in order to get the win . xxmaj benoit and xxmaj guerrero heated up early on by taking turns hammering first xxmaj spike and then xxmaj bubba xxmaj ray . a xxmaj german xxunk by xxmaj benoit to xxmaj bubba took the wind out of the xxmaj dudley brother . xxmaj spike tried to help his brother , but the referee restrained him while xxmaj benoit and xxmaj guerreropos
1xxbos xxmaj by now you 've probably heard a bit about the new xxmaj disney dub of xxmaj miyazaki 's classic film , xxmaj laputa : xxmaj castle xxmaj in xxmaj the xxmaj sky . xxmaj during late summer of 1998 , xxmaj disney released \" kiki 's xxmaj delivery xxmaj service \" on video which included a preview of the xxmaj laputa dub saying it was due out in \" 1 xxrep 3 9 \" . xxmaj it 's obviously way past that year now , but the dub has been finally completed . xxmaj and it 's not \" laputa : xxmaj castle xxmaj in xxmaj the xxmaj sky \" , just \" castle xxmaj in xxmaj the xxmaj sky \" for the dub , since xxmaj laputa is not such a nice word in xxmaj spanish ( even though they use the word xxmaj laputa many timespos
2xxbos * ! ! - xxup spoilers - ! ! * \\n\\n xxmaj before i begin this , let me say that i have had both the advantages of seeing this movie on the big screen and of having seen the \" authorized xxmaj version \" of this movie , remade by xxmaj stephen xxmaj king , himself , in 1997 . \\n\\n xxmaj both advantages made me appreciate this version of \" the xxmaj shining , \" all the more . \\n\\n xxmaj also , let me say that xxmaj i 've read xxmaj mr . xxmaj king 's book , \" the xxmaj shining \" on many occasions over the years , and while i love the book and am a huge fan of his work , xxmaj stanley xxmaj kubrick 's retelling of this story is far more compelling … and xxup scary . \\n\\n xxmaj kubrickpos
" ] }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "nums_samp = toks200[:10].map(num)" ], "metadata": { "id": "wKvK6vsk3tTD" }, "execution_count": 41, "outputs": [] }, { "cell_type": "code", "source": [ "nums_samp.map(len)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "87urCHoX3wG9", "outputId": "dc35a59a-030e-49f5-ba16-0ba30178aeb3" }, "execution_count": 42, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(#10) [151,449,162,159,302,211,234,169,348,219]" ] }, "metadata": {}, "execution_count": 42 } ] }, { "cell_type": "code", "source": [ "# 创建模型对文本进行分类\n", "learn = text_classifier_learner(dls_clas, AWD_LSTM, drop_mult=0.5,\n", " metrics=accuracy).to_fp16()" ], "metadata": { "id": "MKb6snSb3x7V" }, "execution_count": 43, "outputs": [] }, { "cell_type": "code", "source": [ "learn = learn.load_encoder('finetuned')" ], "metadata": { "id": "UqIMzctJ6Km7" }, "execution_count": 44, "outputs": [] }, { "cell_type": "code", "source": [ "learn.fit_one_cycle(1, 2e-2)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 81 }, "id": "Yrr088v86MaT", "outputId": "52e1e07d-2469-4acf-e869-247aac193d93" }, "execution_count": 45, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", "\n" ] }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
epochtrain_lossvalid_lossaccuracytime
00.2429150.1803340.93172001:05
" ] }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "learn.freeze_to(-2)\n", "learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 81 }, "id": "m6WF3TU66OcP", "outputId": "85ef86fc-1073-4db6-c437-62aec4db711f" }, "execution_count": 46, "outputs": [ { "data": { "text/html": [ "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
epochtrain_lossvalid_lossaccuracytime
00.2260350.1637800.93864001:04
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ] }, { "cell_type": "code", "source": [ "learn.freeze_to(-3)\n", "learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3))" ], "metadata": { "id": "9adDCiCg6Q9E" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "learn.unfreeze()\n", "learn.fit_one_cycle(2, slice(1e-3/(2.6**4),1e-3))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 112 }, "id": "U4J50X736SYn", "outputId": "e8ac49b2-90a3-47d9-c878-94094d4ab710" }, "execution_count": 48, "outputs": [ { "data": { "text/html": [ "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
epochtrain_lossvalid_lossaccuracytime
00.1564990.1473190.94744001:14
10.1440320.1445210.94772001:12
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "lDFDRAKY6T-6" }, "execution_count": null, "outputs": [] } ] }