Delete word2vec TA.ipynb

33b95a84 · Boas Demeson Pangaribuan · 60e24119 · 60e24119
Commit 33b95a84 authored Apr 02, 2020 by Boas Demeson Pangaribuan
Hide whitespace changes
Inline Side-by-side

Showing with 0 additions and 1465 deletions

word2vec TA.ipynb word2vec TA.ipynb +0 -1465

No files found.
--- a/word2vec TA.ipynb
+++ b/word2vec TA.ipynb
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>SERIAL NO</th>\n",
-       "      <th>NAME</th>\n",
-       "      <th>CATEGORY</th>\n",
-       "      <th>DESCRIPTION &amp; COLOR</th>\n",
-       "      <th>FABRIC</th>\n",
-       "      <th>IMAGE</th>\n",
-       "      <th>SIZE</th>\n",
-       "      <th>PRICE</th>\n",
-       "      <th>PRODUCT ID</th>\n",
-       "      <th>WEBSITE</th>\n",
-       "      <th>PRODUCT URL</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1</td>\n",
-       "      <td>U.S. Polo Assn. Men Brown Genuine Leather Two ...</td>\n",
-       "      <td>accessories</td>\n",
-       "      <td>U.S. Polo Assn. Men Brown Genuine Leather Two ...</td>\n",
-       "      <td>Genuine leather</td>\n",
-       "      <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
-       "      <td>Height: 11.5 cm</td>\n",
-       "      <td>809</td>\n",
-       "      <td>1943420</td>\n",
-       "      <td>Myntra</td>\n",
-       "      <td>https://www.myntra.com/wallets/us-polo-assn/us...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>2</td>\n",
-       "      <td>Baggit Men Black Solid Two Fold Wallet</td>\n",
-       "      <td>accessories</td>\n",
-       "      <td>Baggit Men Black Solid Two Fold Wallet,  Baggi...</td>\n",
-       "      <td>PU</td>\n",
-       "      <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
-       "      <td>Height:</td>\n",
-       "      <td>720</td>\n",
-       "      <td>4608404</td>\n",
-       "      <td>Myntra</td>\n",
-       "      <td>https://www.myntra.com/wallets/baggit/baggit-m...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>3</td>\n",
-       "      <td>HRX by Hrithik Roshan Men Grey Solid Baseball Cap</td>\n",
-       "      <td>accessories</td>\n",
-       "      <td>HRX By Hrithik Roshan Men Grey Solid Baseball ...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>279</td>\n",
-       "      <td>2178513</td>\n",
-       "      <td>Myntra</td>\n",
-       "      <td>https://www.myntra.com/caps/hrx-by-hrithik-ros...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>4</td>\n",
-       "      <td>Puma Unisex Grey Style Military Solid Baseball...</td>\n",
-       "      <td>accessories</td>\n",
-       "      <td>Puma Unisex Grey Style Military Solid Baseball...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>499</td>\n",
-       "      <td>6699035</td>\n",
-       "      <td>Myntra</td>\n",
-       "      <td>https://www.myntra.com/caps/puma/puma-unisex-g...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>5</td>\n",
-       "      <td>FabSeasons Beige Solid Scarf</td>\n",
-       "      <td>accessories</td>\n",
-       "      <td>FabSeasons Beige Solid Scarf, FabSeasons, Scar...</td>\n",
-       "      <td>Acrylic</td>\n",
-       "      <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
-       "      <td>Length:0.9 m</td>\n",
-       "      <td>449</td>\n",
-       "      <td>2439658</td>\n",
-       "      <td>Myntra</td>\n",
-       "      <td>https://www.myntra.com/scarves/fabseasons/fabs...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5</th>\n",
-       "      <td>6</td>\n",
-       "      <td>Ed Hardy Men Black Embellished Belt</td>\n",
-       "      <td>accessories</td>\n",
-       "      <td>Ed Hardy Men Black Embellished Belt,  Ed Hardy...</td>\n",
-       "      <td>Leather</td>\n",
-       "      <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
-       "      <td>Width: 3.7 cm</td>\n",
-       "      <td>1199</td>\n",
-       "      <td>2238752</td>\n",
-       "      <td>Myntra</td>\n",
-       "      <td>https://www.myntra.com/belts/ed-hardy/ed-hardy...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>6</th>\n",
-       "      <td>7</td>\n",
-       "      <td>Roadster Men Tan Brown Leather Belt</td>\n",
-       "      <td>accessories</td>\n",
-       "      <td>Roadster Men Tan Brown Leather Belt, Roadster,...</td>\n",
-       "      <td>Leather</td>\n",
-       "      <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
-       "      <td>Width: 4 cm</td>\n",
-       "      <td>419</td>\n",
-       "      <td>2975974</td>\n",
-       "      <td>Myntra</td>\n",
-       "      <td>https://www.myntra.com/belts/roadster/roadster...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>7</th>\n",
-       "      <td>8</td>\n",
-       "      <td>Peora Silver-Toned Rhodium-Plated Stone-Studde...</td>\n",
-       "      <td>accessories</td>\n",
-       "      <td>Peora Silver Toned Rhodium Plated Stone Studde...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>551</td>\n",
-       "      <td>3006095</td>\n",
-       "      <td>Myntra</td>\n",
-       "      <td>https://www.myntra.com/ring/peora/peora-silver...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>8</th>\n",
-       "      <td>9</td>\n",
-       "      <td>Royal Enfield Unisex White Urban Trooper Helme...</td>\n",
-       "      <td>accessories</td>\n",
-       "      <td>Royal Enfield Unisex White Urban Trooper Helme...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>3500</td>\n",
-       "      <td>2242802</td>\n",
-       "      <td>Myntra</td>\n",
-       "      <td>https://www.myntra.com/helmets/royal-enfield/r...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>9</th>\n",
-       "      <td>10</td>\n",
-       "      <td>BuckleUp Men Black Leather Belt</td>\n",
-       "      <td>accessories</td>\n",
-       "      <td>BuckleUp Men Black Leather Belt, BuckleUp, Bel...</td>\n",
-       "      <td>Leather</td>\n",
-       "      <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
-       "      <td>Width: 3.5 cm</td>\n",
-       "      <td>517</td>\n",
-       "      <td>1734718</td>\n",
-       "      <td>Myntra</td>\n",
-       "      <td>https://www.myntra.com/belts/buckleup/buckleup...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   SERIAL NO                                               NAME     CATEGORY  \\\n",
-       "0          1  U.S. Polo Assn. Men Brown Genuine Leather Two ...  accessories   \n",
-       "1          2             Baggit Men Black Solid Two Fold Wallet  accessories   \n",
-       "2          3  HRX by Hrithik Roshan Men Grey Solid Baseball Cap  accessories   \n",
-       "3          4  Puma Unisex Grey Style Military Solid Baseball...  accessories   \n",
-       "4          5                       FabSeasons Beige Solid Scarf  accessories   \n",
-       "5          6                Ed Hardy Men Black Embellished Belt  accessories   \n",
-       "6          7                Roadster Men Tan Brown Leather Belt  accessories   \n",
-       "7          8  Peora Silver-Toned Rhodium-Plated Stone-Studde...  accessories   \n",
-       "8          9  Royal Enfield Unisex White Urban Trooper Helme...  accessories   \n",
-       "9         10                    BuckleUp Men Black Leather Belt  accessories   \n",
-       "\n",
-       "                                 DESCRIPTION & COLOR            FABRIC  \\\n",
-       "0  U.S. Polo Assn. Men Brown Genuine Leather Two ...  Genuine leather    \n",
-       "1  Baggit Men Black Solid Two Fold Wallet,  Baggi...               PU    \n",
-       "2  HRX By Hrithik Roshan Men Grey Solid Baseball ...               NaN   \n",
-       "3  Puma Unisex Grey Style Military Solid Baseball...               NaN   \n",
-       "4  FabSeasons Beige Solid Scarf, FabSeasons, Scar...          Acrylic    \n",
-       "5  Ed Hardy Men Black Embellished Belt,  Ed Hardy...          Leather    \n",
-       "6  Roadster Men Tan Brown Leather Belt, Roadster,...           Leather   \n",
-       "7  Peora Silver Toned Rhodium Plated Stone Studde...               NaN   \n",
-       "8  Royal Enfield Unisex White Urban Trooper Helme...               NaN   \n",
-       "9  BuckleUp Men Black Leather Belt, BuckleUp, Bel...           Leather   \n",
-       "\n",
-       "                                               IMAGE             SIZE PRICE  \\\n",
-       "0  https://assets.myntassets.com/h_1440,q_100,w_1...  Height: 11.5 cm   809   \n",
-       "1  https://assets.myntassets.com/h_1440,q_100,w_1...          Height:   720   \n",
-       "2  https://assets.myntassets.com/h_1440,q_100,w_1...              NaN   279   \n",
-       "3  https://assets.myntassets.com/h_1440,q_100,w_1...              NaN   499   \n",
-       "4  https://assets.myntassets.com/h_1440,q_100,w_1...     Length:0.9 m   449   \n",
-       "5  https://assets.myntassets.com/h_1440,q_100,w_1...    Width: 3.7 cm  1199   \n",
-       "6  https://assets.myntassets.com/h_1440,q_100,w_1...      Width: 4 cm   419   \n",
-       "7  https://assets.myntassets.com/h_1440,q_100,w_1...              NaN   551   \n",
-       "8  https://assets.myntassets.com/h_1440,q_100,w_1...              NaN  3500   \n",
-       "9  https://assets.myntassets.com/h_1440,q_100,w_1...    Width: 3.5 cm   517   \n",
-       "\n",
-       "   PRODUCT ID WEBSITE                                        PRODUCT URL  \n",
-       "0     1943420  Myntra  https://www.myntra.com/wallets/us-polo-assn/us...  \n",
-       "1     4608404  Myntra  https://www.myntra.com/wallets/baggit/baggit-m...  \n",
-       "2     2178513  Myntra  https://www.myntra.com/caps/hrx-by-hrithik-ros...  \n",
-       "3     6699035  Myntra  https://www.myntra.com/caps/puma/puma-unisex-g...  \n",
-       "4     2439658  Myntra  https://www.myntra.com/scarves/fabseasons/fabs...  \n",
-       "5     2238752  Myntra  https://www.myntra.com/belts/ed-hardy/ed-hardy...  \n",
-       "6     2975974  Myntra  https://www.myntra.com/belts/roadster/roadster...  \n",
-       "7     3006095  Myntra  https://www.myntra.com/ring/peora/peora-silver...  \n",
-       "8     2242802  Myntra  https://www.myntra.com/helmets/royal-enfield/r...  \n",
-       "9     1734718  Myntra  https://www.myntra.com/belts/buckleup/buckleup...  "
-      ]
-     },
-     "execution_count": 1,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "import random\n",
-    "\n",
-    "# Load the raw product catalogue; `do` is the untouched source frame used by later cells.\n",
-    "do = pd.read_csv('men-products.csv', encoding='utf-8')\n",
-    "do.head(10)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Missing-value check (ignore the tuple)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>total_missing</th>\n",
-       "      <th>percent_missing</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>SERIAL NO</th>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>NAME</th>\n",
-       "      <td>1</td>\n",
-       "      <td>0.005</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>CATEGORY</th>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>DESCRIPTION &amp; COLOR</th>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>FABRIC</th>\n",
-       "      <td>4833</td>\n",
-       "      <td>24.165</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>IMAGE</th>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>SIZE</th>\n",
-       "      <td>3838</td>\n",
-       "      <td>19.190</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>PRICE</th>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>PRODUCT ID</th>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>WEBSITE</th>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>PRODUCT URL</th>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                     total_missing  percent_missing\n",
-       "SERIAL NO                        0            0.000\n",
-       "NAME                             1            0.005\n",
-       "CATEGORY                         0            0.000\n",
-       "DESCRIPTION & COLOR              0            0.000\n",
-       "FABRIC                        4833           24.165\n",
-       "IMAGE                            0            0.000\n",
-       "SIZE                          3838           19.190\n",
-       "PRICE                            0            0.000\n",
-       "PRODUCT ID                       0            0.000\n",
-       "WEBSITE                          0            0.000\n",
-       "PRODUCT URL                      0            0.000"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Missing values per column, as a count and as a percentage of the rows actually loaded.\n",
-    "# Dividing by len(do) instead of a fixed row count keeps the percentage correct for any CSV size.\n",
-    "null_counts = do.isnull().sum()\n",
-    "missing_data = pd.DataFrame({'total_missing': null_counts, 'percent_missing': null_counts / len(do) * 100})\n",
-    "missing_data"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Lossy data compression: drop unused columns"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Remove, one by one and in place, the columns that later cells do not use.\n",
-    "for unused_column in ['FABRIC', 'IMAGE', 'SIZE', 'WEBSITE', 'PRODUCT URL', 'PRICE', 'PRODUCT ID']:\n",
-    "    del do[unused_column]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Work on a reproducible random subset of 10000 rows, detached from `do`.\n",
-    "df = do.sample(n=10000, random_state=1).copy()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Rename the product categories to human-readable labels.\n",
-    "# Categories that are not listed here keep their original value.\n",
-    "category_labels = {\n",
-    "    'accessories': 'Accessories',\n",
-    "    'casual-shirts': 'Casual Shirts',\n",
-    "    'Men-Casual-Trousers': 'Men Casual Trousers',\n",
-    "    'formal-shirts': 'Formal Shirts',\n",
-    "    'Men-Formal-Trousers': 'Men Formal Trousers',\n",
-    "    'men-jackets-coats': 'Men Jackets Coats',\n",
-    "    'men-swimwear': 'Men Swimwear',\n",
-    "    'men-suits': 'Men Suits',\n",
-    "}\n",
-    "df['CATEGORY'] = df['CATEGORY'].replace(category_labels)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>SERIAL NO</th>\n",
-       "      <th>NAME</th>\n",
-       "      <th>CATEGORY</th>\n",
-       "      <th>DESCRIPTION &amp; COLOR</th>\n",
-       "      <th>CATEGORY_ID</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>25552</th>\n",
-       "      <td>25553</td>\n",
-       "      <td>Fort Collins Men Red Solid Padded Jacket</td>\n",
-       "      <td>Men Jackets Coats</td>\n",
-       "      <td>Fort Collins Men Red Solid Padded Jacket,  For...</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>18639</th>\n",
-       "      <td>18640</td>\n",
-       "      <td>MANGO MAN Men Navy Blue Tailored Slim Fit Soli...</td>\n",
-       "      <td>Men Formal Trousers</td>\n",
-       "      <td>MANGO MAN Men Navy Blue Tailored Slim Fit Soli...</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>18542</th>\n",
-       "      <td>18543</td>\n",
-       "      <td>Arrow Men Navy Blue Tapered Fit Checked Formal...</td>\n",
-       "      <td>Men Formal Trousers</td>\n",
-       "      <td>Arrow Men Navy Blue Tapered Fit Checked Formal...</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>21474</th>\n",
-       "      <td>21475</td>\n",
-       "      <td>Hanes Charcoal Grey Thermal T-Shirt</td>\n",
-       "      <td>Innerwear &amp; Sleapwear</td>\n",
-       "      <td>Hanes Charcoal Grey Thermal T Shirt,  Hanes, T...</td>\n",
-       "      <td>2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>14858</th>\n",
-       "      <td>14859</td>\n",
-       "      <td>Hancock Men Blue Regular Fit Striped Formal Shirt</td>\n",
-       "      <td>Formal Shirts</td>\n",
-       "      <td>Hancock Men Blue Regular Fit Striped Formal Sh...</td>\n",
-       "      <td>3</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       SERIAL NO                                               NAME  \\\n",
-       "25552      25553           Fort Collins Men Red Solid Padded Jacket   \n",
-       "18639      18640  MANGO MAN Men Navy Blue Tailored Slim Fit Soli...   \n",
-       "18542      18543  Arrow Men Navy Blue Tapered Fit Checked Formal...   \n",
-       "21474      21475                Hanes Charcoal Grey Thermal T-Shirt   \n",
-       "14858      14859  Hancock Men Blue Regular Fit Striped Formal Shirt   \n",
-       "\n",
-       "                    CATEGORY  \\\n",
-       "25552      Men Jackets Coats   \n",
-       "18639    Men Formal Trousers   \n",
-       "18542    Men Formal Trousers   \n",
-       "21474  Innerwear & Sleapwear   \n",
-       "14858          Formal Shirts   \n",
-       "\n",
-       "                                     DESCRIPTION & COLOR  CATEGORY_ID  \n",
-       "25552  Fort Collins Men Red Solid Padded Jacket,  For...            0  \n",
-       "18639  MANGO MAN Men Navy Blue Tailored Slim Fit Soli...            1  \n",
-       "18542  Arrow Men Navy Blue Tapered Fit Checked Formal...            1  \n",
-       "21474  Hanes Charcoal Grey Thermal T Shirt,  Hanes, T...            2  \n",
-       "14858  Hancock Men Blue Regular Fit Striped Formal Sh...            3  "
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Integer-encode CATEGORY into a new CATEGORY_ID column\n",
-    "df['CATEGORY_ID'] = pd.factorize(df['CATEGORY'])[0]\n",
-    "CATEGORY_ID_DF = df.loc[:, ['CATEGORY', 'CATEGORY_ID']].drop_duplicates()\n",
-    "\n",
-    "\n",
-    "# Lookup tables in both directions, kept for later cells\n",
-    "CATEGORY_TO_ID = dict(CATEGORY_ID_DF[['CATEGORY', 'CATEGORY_ID']].values)\n",
-    "ID_TO_CATEGORY = dict(CATEGORY_ID_DF[['CATEGORY_ID', 'CATEGORY']].values)\n",
-    "\n",
-    "# Preview the encoded frame\n",
-    "df.head()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# rename column"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Shorten the description column name (mutates `df` in place).\n",
-    "column_renames = {'DESCRIPTION & COLOR': 'DESCRIPTION'}\n",
-    "df.rename(columns=column_renames, inplace=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>SERIAL NO</th>\n",
-       "      <th>NAME</th>\n",
-       "      <th>CATEGORY</th>\n",
-       "      <th>DESCRIPTION</th>\n",
-       "      <th>CATEGORY_ID</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>25552</th>\n",
-       "      <td>25553</td>\n",
-       "      <td>Fort Collins Men Red Solid Padded Jacket</td>\n",
-       "      <td>Men Jackets Coats</td>\n",
-       "      <td>Fort Collins Men Red Solid Padded Jacket,  For...</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>18639</th>\n",
-       "      <td>18640</td>\n",
-       "      <td>MANGO MAN Men Navy Blue Tailored Slim Fit Soli...</td>\n",
-       "      <td>Men Formal Trousers</td>\n",
-       "      <td>MANGO MAN Men Navy Blue Tailored Slim Fit Soli...</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>18542</th>\n",
-       "      <td>18543</td>\n",
-       "      <td>Arrow Men Navy Blue Tapered Fit Checked Formal...</td>\n",
-       "      <td>Men Formal Trousers</td>\n",
-       "      <td>Arrow Men Navy Blue Tapered Fit Checked Formal...</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>21474</th>\n",
-       "      <td>21475</td>\n",
-       "      <td>Hanes Charcoal Grey Thermal T-Shirt</td>\n",
-       "      <td>Innerwear &amp; Sleapwear</td>\n",
-       "      <td>Hanes Charcoal Grey Thermal T Shirt,  Hanes, T...</td>\n",
-       "      <td>2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>14858</th>\n",
-       "      <td>14859</td>\n",
-       "      <td>Hancock Men Blue Regular Fit Striped Formal Shirt</td>\n",
-       "      <td>Formal Shirts</td>\n",
-       "      <td>Hancock Men Blue Regular Fit Striped Formal Sh...</td>\n",
-       "      <td>3</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>59142</th>\n",
-       "      <td>59190</td>\n",
-       "      <td>Tantra Men Black Printed Round Neck T-shirt</td>\n",
-       "      <td>T-Shirts</td>\n",
-       "      <td>Tantra Men Black Printed Round Neck T Shirt,  ...</td>\n",
-       "      <td>4</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>35661</th>\n",
-       "      <td>35662</td>\n",
-       "      <td>Aeropostale Men Blue Regular Fit Mid-Rise Mild...</td>\n",
-       "      <td>Jeans</td>\n",
-       "      <td>Aeropostale Men Blue Regular Fit Mid Rise Mild...</td>\n",
-       "      <td>5</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3631</th>\n",
-       "      <td>3632</td>\n",
-       "      <td>ether Men Navy Blue Slim Fit Anti Microbial Co...</td>\n",
-       "      <td>Casual Shirts</td>\n",
-       "      <td>Ether Men Navy Blue Slim Fit Anti Microbial Co...</td>\n",
-       "      <td>6</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>26605</th>\n",
-       "      <td>26606</td>\n",
-       "      <td>Roadster Men White Regular Fit Mid-Rise Clean ...</td>\n",
-       "      <td>Jeans</td>\n",
-       "      <td>Roadster Men White Regular Fit Mid Rise Clean ...</td>\n",
-       "      <td>5</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>21322</th>\n",
-       "      <td>21323</td>\n",
-       "      <td>Dollar Bigboss Pack of 3 Trunks MDTR-03-PO3-4</td>\n",
-       "      <td>Innerwear &amp; Sleapwear</td>\n",
-       "      <td>Dollar Bigboss Pack Of 3 Trunks MDTR 03 PO3 4,...</td>\n",
-       "      <td>2</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "       SERIAL NO                                               NAME  \\\n",
-       "25552      25553           Fort Collins Men Red Solid Padded Jacket   \n",
-       "18639      18640  MANGO MAN Men Navy Blue Tailored Slim Fit Soli...   \n",
-       "18542      18543  Arrow Men Navy Blue Tapered Fit Checked Formal...   \n",
-       "21474      21475                Hanes Charcoal Grey Thermal T-Shirt   \n",
-       "14858      14859  Hancock Men Blue Regular Fit Striped Formal Shirt   \n",
-       "59142      59190        Tantra Men Black Printed Round Neck T-shirt   \n",
-       "35661      35662  Aeropostale Men Blue Regular Fit Mid-Rise Mild...   \n",
-       "3631        3632  ether Men Navy Blue Slim Fit Anti Microbial Co...   \n",
-       "26605      26606  Roadster Men White Regular Fit Mid-Rise Clean ...   \n",
-       "21322      21323      Dollar Bigboss Pack of 3 Trunks MDTR-03-PO3-4   \n",
-       "\n",
-       "                    CATEGORY  \\\n",
-       "25552      Men Jackets Coats   \n",
-       "18639    Men Formal Trousers   \n",
-       "18542    Men Formal Trousers   \n",
-       "21474  Innerwear & Sleapwear   \n",
-       "14858          Formal Shirts   \n",
-       "59142               T-Shirts   \n",
-       "35661                  Jeans   \n",
-       "3631           Casual Shirts   \n",
-       "26605                  Jeans   \n",
-       "21322  Innerwear & Sleapwear   \n",
-       "\n",
-       "                                             DESCRIPTION  CATEGORY_ID  \n",
-       "25552  Fort Collins Men Red Solid Padded Jacket,  For...            0  \n",
-       "18639  MANGO MAN Men Navy Blue Tailored Slim Fit Soli...            1  \n",
-       "18542  Arrow Men Navy Blue Tapered Fit Checked Formal...            1  \n",
-       "21474  Hanes Charcoal Grey Thermal T Shirt,  Hanes, T...            2  \n",
-       "14858  Hancock Men Blue Regular Fit Striped Formal Sh...            3  \n",
-       "59142  Tantra Men Black Printed Round Neck T Shirt,  ...            4  \n",
-       "35661  Aeropostale Men Blue Regular Fit Mid Rise Mild...            5  \n",
-       "3631   Ether Men Navy Blue Slim Fit Anti Microbial Co...            6  \n",
-       "26605  Roadster Men White Regular Fit Mid Rise Clean ...            5  \n",
-       "21322  Dollar Bigboss Pack Of 3 Trunks MDTR 03 PO3 4,...            2  "
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Preview the first ten rows of the working frame.\n",
-    "df.head(n=10)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Positional, non-overlapping train/test split.\n",
-    "# `df` is a shuffled sample, so a label-based `.loc[:1000]` cuts wherever label 1000\n",
-    "# happens to sit rather than after 1000 rows, and reusing one slice for both sets\n",
-    "# makes the validation score meaningless. `.iloc` slices by row position instead.\n",
-    "TRAIN_FRACTION = 0.8\n",
-    "n_train = int(len(df) * TRAIN_FRACTION)\n",
-    "\n",
-    "X_train = df['DESCRIPTION'].iloc[:n_train].values\n",
-    "y_train = df['CATEGORY_ID'].iloc[:n_train].values\n",
-    "X_test = df['DESCRIPTION'].iloc[n_train:].values\n",
-    "y_test = df['CATEGORY_ID'].iloc[n_train:].values"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from tensorflow.python.keras.preprocessing.text import Tokenizer\n",
-    "from tensorflow.python.keras.preprocessing.sequence import pad_sequences\n",
-    "\n",
-    "# Fit the tokenizer on every description from both sets.\n",
-    "# The arrays must be stacked with np.concatenate: `X_train + X_test` on object arrays of\n",
-    "# strings joins the strings element by element (or fails when the lengths differ).\n",
-    "tokenizer_obj = Tokenizer()\n",
-    "total_descriptions = np.concatenate([X_train, X_test])\n",
-    "tokenizer_obj.fit_on_texts(total_descriptions)\n",
-    "\n",
-    "# pad sequences to the longest description, measured in whitespace-separated words\n",
-    "max_length = max(len(s.split()) for s in total_descriptions)\n",
-    "\n",
-    "# define vocabulary size (+1 because word_index starts at 1; index 0 is the padding value)\n",
-    "vocab_size = len(tokenizer_obj.word_index) + 1\n",
-    "\n",
-    "X_train_tokens = tokenizer_obj.texts_to_sequences(X_train)\n",
-    "X_test_tokens = tokenizer_obj.texts_to_sequences(X_test)\n",
-    "\n",
-    "X_train_pad = pad_sequences(X_train_tokens, maxlen=max_length, padding='post')\n",
-    "X_test_pad = pad_sequences(X_test_tokens, maxlen=max_length, padding='post')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# learn word embedding"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# build model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Using TensorFlow backend.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Build model...\n"
-     ]
-    }
-   ],
-   "source": [
-    "from keras.models import Sequential\n",
-    "from keras.layers import Dense, Embedding, LSTM, GRU\n",
-    "\n",
-    "EMBEDDING_DIM = 100\n",
-    "# CATEGORY_ID holds integer class ids starting at 0, so the label space is max id + 1.\n",
-    "NUM_CLASSES = int(df['CATEGORY_ID'].max()) + 1\n",
-    "\n",
-    "print('Build model...')\n",
-    "\n",
-    "model = Sequential()\n",
-    "model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=max_length))\n",
-    "model.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))\n",
-    "# One softmax unit per category: the targets are multi-class ids, not 0/1 labels,\n",
-    "# so a single sigmoid unit with binary cross-entropy cannot fit them.\n",
-    "model.add(Dense(NUM_CLASSES, activation='softmax'))\n",
-    "\n",
-    "# try using different optimizers and different optimizer configs\n",
-    "# sparse_categorical_crossentropy takes the integer ids directly (no one-hot encoding needed)\n",
-    "model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# train model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Train...\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "C:\\Users\\User\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\framework\\indexed_slices.py:433: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
-      "  \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Train on 7853 samples, validate on 7853 samples\n",
-      "Epoch 1/2\n",
-      " - 6s - loss: -1.2412e+10 - accuracy: 0.0507 - val_loss: -2.1084e+01 - val_accuracy: 0.0506\n",
-      "Epoch 2/2\n",
-      " - 6s - loss: -1.9216e+11 - accuracy: 0.0506 - val_loss: -3.2632e+01 - val_accuracy: 0.0506\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "<keras.callbacks.callbacks.History at 0x170704eeef0>"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "print('Train...')\n",
-    "\n",
-    "model.fit(X_train_pad, y_train, batch_size=128, epochs=2, validation_data=(X_test_pad, y_test), verbose=2)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# test model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([[0.99995184],\n",
-       "       [0.99995184],\n",
-       "       [0.99995184],\n",
-       "       [0.99995184],\n",
-       "       [0.99995184]], dtype=float32)"
-      ]
-     },
-     "execution_count": 13,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "#Let us test some samples\n",
-    "\n",
-    "test_sample_1 = \"Puma Unisex Gery Style\"\n",
-    "test_sample_2 = \"Good jeans!\"\n",
-    "test_sample_3 = \"Maybe I like this jeans.\"\n",
-    "test_sample_4 = \"Not to my taste, will skip and choose another jeans\"\n",
-    "test_sample_5 = \"Bad jeans\"\n",
-    "test_samples = [test_sample_1, test_sample_2, test_sample_3, test_sample_4, test_sample_5]\n",
-    "\n",
-    "test_samples_tokens = tokenizer_obj.texts_to_sequences(test_samples)\n",
-    "test_samples_tokens_pad = pad_sequences(test_samples_tokens, maxlen=max_length)\n",
-    "\n",
-    "#predict\n",
-    "model.predict(x=test_samples_tokens_pad)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# train word2vec embedding\n",
-    "# text preprocessing"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import string\n",
-    "from nltk.tokenize import word_tokenize\n",
-    "from nltk.corpus import stopwords\n",
-    "\n",
-    "description_lines = list()\n",
-    "lines = df['DESCRIPTION'].values.tolist()\n",
-    "\n",
-    "for line in lines:\n",
-    "    tokens = word_tokenize(line)\n",
-    "    # convert to lower case\n",
-    "    tokens = [w.lower() for w in tokens]\n",
-    "    # remove punctuation from each word\n",
-    "    table = str.maketrans('','', string.punctuation)\n",
-    "    stripped = [w.translate(table) for w in tokens]\n",
-    "    # remove remaining tokens that are not alphabetic\n",
-    "    words = [word for word in stripped if word.isalpha()]\n",
-    "    #filter out stop words\n",
-    "    stop_words = set(stopwords.words('english'))\n",
-    "    words = [w for w in words if not w in stop_words]\n",
-    "    description_lines.append(words)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "10000"
-      ]
-     },
-     "execution_count": 15,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "len(description_lines)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Vocabulary size: 1543\n"
-     ]
-    }
-   ],
-   "source": [
-    "from gensim.models import Word2Vec\n",
-    "\n",
-    "#train word2vec model\n",
-    "model = Word2Vec(sentences=description_lines, size=EMBEDDING_DIM, window=5, workers=4, min_count=1)\n",
-    "#vocab size\n",
-    "words = list(model.wv.vocab)\n",
-    "print('Vocabulary size: %d' % len(words))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# test word2vec model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[('maroon', 0.6926736235618591),\n",
-       " ('grey', 0.6636859774589539),\n",
-       " ('brown', 0.6268335580825806),\n",
-       " ('white', 0.6045646071434021),\n",
-       " ('burgundy', 0.5724130868911743),\n",
-       " ('orange', 0.5616232752799988),\n",
-       " ('yellow', 0.5448090434074402),\n",
-       " ('blue', 0.5399770736694336),\n",
-       " ('beige', 0.5213257074356079),\n",
-       " ('mauve', 0.5187932848930359)]"
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "model.wv.most_similar('black')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[('sp', 0.8349928855895996),\n",
-       " ('wars', 0.7996177673339844),\n",
-       " ('regallo', 0.761871874332428),\n",
-       " ('high', 0.7507779598236084),\n",
-       " ('star', 0.7447826266288757),\n",
-       " ('powell', 0.7392982840538025),\n",
-       " ('ice', 0.7346605062484741),\n",
-       " ('dyed', 0.7304413318634033),\n",
-       " ('density', 0.7266365885734558),\n",
-       " ('connection', 0.7252252101898193)]"
-      ]
-     },
-     "execution_count": 18,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# analogy query: which words relate to 'jeans' the way 'woman' relates to 'man'?\n",
-    "model.wv.most_similar_cosmul(positive=['woman', 'jeans'], negative=['man'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "black\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "C:\\Users\\User\\Anaconda3\\lib\\site-packages\\gensim\\models\\keyedvectors.py:877: FutureWarning: arrays to stack must be passed as a \"sequence\" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.\n",
-      "  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)\n"
-     ]
-    }
-   ],
-   "source": [
-    "#odd word out\n",
-    "print(model.wv.doesnt_match(\"men black jeans\".split()))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[('apparel', 0.6134017705917358),\n",
-       " ('esprit', 0.5591789484024048),\n",
-       " ('highlander', 0.5499693751335144),\n",
-       " ('ether', 0.5297696590423584),\n",
-       " ('wrangler', 0.4958726167678833),\n",
-       " ('lee', 0.49490395188331604),\n",
-       " ('izod', 0.4832093119621277),\n",
-       " ('breakbounce', 0.471257746219635),\n",
-       " ('realm', 0.46333152055740356),\n",
-       " ('aeropostale', 0.46125224232673645)]"
-      ]
-     },
-     "execution_count": 20,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# look up top 10 words similar to 'men'\n",
-    "w1 = \"men\"\n",
-    "model.wv.most_similar (positive=w1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[('price', 0.6506756544113159),\n",
-       " ('online', 0.4503338932991028),\n",
-       " ('india', 0.44190680980682373),\n",
-       " ('partywear', 0.37221819162368774),\n",
-       " ('hatch', 0.33273035287857056),\n",
-       " ('sp', 0.31096935272216797)]"
-      ]
-     },
-     "execution_count": 21,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# look up top 6 words similar to 'best'\n",
-    "w1 = \"best\"\n",
-    "model.wv.most_similar (positive=w1, topn=6)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "1.0"
-      ]
-     },
-     "execution_count": 22,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "#similarity between two identical words\n",
-    "model.wv.similarity(w1=\"black\", w2=\"black\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "0.6045646"
-      ]
-     },
-     "execution_count": 23,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "#similarity between two different words \n",
-    "model.wv.similarity(w1=\"black\", w2=\"white\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "0.62683356"
-      ]
-     },
-     "execution_count": 24,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "#similarity between two different words\n",
-    "model.wv.similarity(w1=\"black\", w2=\"brown\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'white'"
-      ]
-     },
-     "execution_count": 25,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "#odd word out\n",
-    "model.wv.doesnt_match([\"brown\", \"white\", \"black\"])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#save model\n",
-    "filename = 'imdb_embedding_word2vec.txt'\n",
-    "model.wv.save_word2vec_format(filename, binary=False)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# use pre-trained embedding"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "\n",
-    "embeddings_index = {}\n",
-    "f = open(os.path.join('', 'imdb_embedding_word2vec.txt'), encoding = \"utf-8\")\n",
-    "for line in f:\n",
-    "    values = line.split()\n",
-    "    word = values[0]\n",
-    "    coefs = np.asarray(values[1:])\n",
-    "    embeddings_index[word] = coefs\n",
-    "f.close()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Found 1543 unique tokens.\n",
-      "Shape of description tensor: (10000, 107)\n",
-      "Shape of category tensor: (10000,)\n"
-     ]
-    }
-   ],
-   "source": [
-    "# vectorize the text samples into a 2D integer tensor\n",
-    "tokenizer_obj = Tokenizer()\n",
-    "tokenizer_obj.fit_on_texts(description_lines)\n",
-    "sequences = tokenizer_obj.texts_to_sequences(description_lines)\n",
-    "\n",
-    "# report the vocabulary size, then pad to max_length (computed in the earlier tokenizer cell)\n",
-    "word_index = tokenizer_obj.word_index\n",
-    "print('Found %s unique tokens.' % len(word_index))\n",
-    "\n",
-    "description_pad = pad_sequences(sequences, maxlen=max_length)\n",
-    "category = df['CATEGORY_ID'].values\n",
-    "print('Shape of description tensor:', description_pad.shape)\n",
-    "print('Shape of category tensor:', category.shape)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "num_words = len(word_index) + 1\n",
-    "embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))\n",
-    "\n",
-    "for word, i in word_index.items():\n",
-    "    if i > num_words:\n",
-    "        continue\n",
-    "    embedding_vector = embeddings_index.get(word)\n",
-    "    if embedding_vector is not None:\n",
-    "        # copy the trained vector; words missing from embeddings_index keep their all-zero row\n",
-    "        embedding_matrix[i] = embedding_vector"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1544\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(num_words)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 31,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from keras.models import Sequential\n",
-    "from keras.layers import Dense, Embedding, LSTM, GRU\n",
-    "from keras.layers.embeddings import Embedding\n",
-    "from keras.initializers import Constant\n",
-    "\n",
-    "#define model\n",
-    "model = Sequential()\n",
-    "embedding_layer = Embedding(num_words, \n",
-    "                            EMBEDDING_DIM, \n",
-    "                            embeddings_initializer=Constant(embedding_matrix), \n",
-    "                            input_length=max_length, \n",
-    "                            trainable=False)\n",
-    "model.add(embedding_layer)\n",
-    "model.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))\n",
-    "model.add(Dense(1, activation='sigmoid'))\n",
-    "\n",
-    "#try using different optimizers and different optimizer configs\n",
-    "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 32,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# split the data into a training set and a validation set\n",
-    "VALIDATION_SPLIT = 0.2\n",
-    "\n",
-    "indices = np.arange(description_pad.shape[0])\n",
-    "np.random.shuffle(indices)\n",
-    "description_pad = description_pad[indices]\n",
-    "category = category[indices]\n",
-    "num_validation_samples = int(VALIDATION_SPLIT * description_pad.shape[0])\n",
-    "\n",
-    "X_train_pad = description_pad[:-num_validation_samples]\n",
-    "y_train = category[:-num_validation_samples]\n",
-    "X_test_pad = description_pad[-num_validation_samples:]\n",
-    "y_test = category[-num_validation_samples:]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 33,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Shape of X_train_pad tensor: (8000, 107)\n",
-      "Shape of y_train tensor: (8000,)\n",
-      "Shape of X_test_pad tensor: (2000, 107)\n",
-      "Shape of y_test tensor: (2000,)\n"
-     ]
-    }
-   ],
-   "source": [
-    "print('Shape of X_train_pad tensor:', X_train_pad.shape)\n",
-    "print('Shape of y_train tensor:', y_train.shape)\n",
-    "\n",
-    "print('Shape of X_test_pad tensor:', X_test_pad.shape)\n",
-    "print('Shape of y_test tensor:', y_test.shape)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 34,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Train...\n",
-      "Train on 8000 samples, validate on 2000 samples\n",
-      "Epoch 1/2\n",
-      " - 6s - loss: -1.2647e+09 - accuracy: 0.0514 - val_loss: -1.2498e+01 - val_accuracy: 0.0435\n",
-      "Epoch 2/2\n",
-      " - 5s - loss: -7.2270e+10 - accuracy: 0.0510 - val_loss: -2.0268e+01 - val_accuracy: 0.0435\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "<keras.callbacks.callbacks.History at 0x1706c899f98>"
-      ]
-     },
-     "execution_count": 34,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "print('Train...')\n",
-    "\n",
-    "model.fit(X_train_pad, y_train, batch_size=128, epochs=2, validation_data=(X_test_pad, y_test), verbose=2)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}