Delete word2vec TA.ipynb

parent 60e24119
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>SERIAL NO</th>\n",
" <th>NAME</th>\n",
" <th>CATEGORY</th>\n",
" <th>DESCRIPTION &amp; COLOR</th>\n",
" <th>FABRIC</th>\n",
" <th>IMAGE</th>\n",
" <th>SIZE</th>\n",
" <th>PRICE</th>\n",
" <th>PRODUCT ID</th>\n",
" <th>WEBSITE</th>\n",
" <th>PRODUCT URL</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>U.S. Polo Assn. Men Brown Genuine Leather Two ...</td>\n",
" <td>accessories</td>\n",
" <td>U.S. Polo Assn. Men Brown Genuine Leather Two ...</td>\n",
" <td>Genuine leather</td>\n",
" <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
" <td>Height: 11.5 cm</td>\n",
" <td>809</td>\n",
" <td>1943420</td>\n",
" <td>Myntra</td>\n",
" <td>https://www.myntra.com/wallets/us-polo-assn/us...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Baggit Men Black Solid Two Fold Wallet</td>\n",
" <td>accessories</td>\n",
" <td>Baggit Men Black Solid Two Fold Wallet, Baggi...</td>\n",
" <td>PU</td>\n",
" <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
" <td>Height:</td>\n",
" <td>720</td>\n",
" <td>4608404</td>\n",
" <td>Myntra</td>\n",
" <td>https://www.myntra.com/wallets/baggit/baggit-m...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>HRX by Hrithik Roshan Men Grey Solid Baseball Cap</td>\n",
" <td>accessories</td>\n",
" <td>HRX By Hrithik Roshan Men Grey Solid Baseball ...</td>\n",
" <td>NaN</td>\n",
" <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
" <td>NaN</td>\n",
" <td>279</td>\n",
" <td>2178513</td>\n",
" <td>Myntra</td>\n",
" <td>https://www.myntra.com/caps/hrx-by-hrithik-ros...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Puma Unisex Grey Style Military Solid Baseball...</td>\n",
" <td>accessories</td>\n",
" <td>Puma Unisex Grey Style Military Solid Baseball...</td>\n",
" <td>NaN</td>\n",
" <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
" <td>NaN</td>\n",
" <td>499</td>\n",
" <td>6699035</td>\n",
" <td>Myntra</td>\n",
" <td>https://www.myntra.com/caps/puma/puma-unisex-g...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>FabSeasons Beige Solid Scarf</td>\n",
" <td>accessories</td>\n",
" <td>FabSeasons Beige Solid Scarf, FabSeasons, Scar...</td>\n",
" <td>Acrylic</td>\n",
" <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
" <td>Length:0.9 m</td>\n",
" <td>449</td>\n",
" <td>2439658</td>\n",
" <td>Myntra</td>\n",
" <td>https://www.myntra.com/scarves/fabseasons/fabs...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>6</td>\n",
" <td>Ed Hardy Men Black Embellished Belt</td>\n",
" <td>accessories</td>\n",
" <td>Ed Hardy Men Black Embellished Belt, Ed Hardy...</td>\n",
" <td>Leather</td>\n",
" <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
" <td>Width: 3.7 cm</td>\n",
" <td>1199</td>\n",
" <td>2238752</td>\n",
" <td>Myntra</td>\n",
" <td>https://www.myntra.com/belts/ed-hardy/ed-hardy...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>7</td>\n",
" <td>Roadster Men Tan Brown Leather Belt</td>\n",
" <td>accessories</td>\n",
" <td>Roadster Men Tan Brown Leather Belt, Roadster,...</td>\n",
" <td>Leather</td>\n",
" <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
" <td>Width: 4 cm</td>\n",
" <td>419</td>\n",
" <td>2975974</td>\n",
" <td>Myntra</td>\n",
" <td>https://www.myntra.com/belts/roadster/roadster...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>8</td>\n",
" <td>Peora Silver-Toned Rhodium-Plated Stone-Studde...</td>\n",
" <td>accessories</td>\n",
" <td>Peora Silver Toned Rhodium Plated Stone Studde...</td>\n",
" <td>NaN</td>\n",
" <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
" <td>NaN</td>\n",
" <td>551</td>\n",
" <td>3006095</td>\n",
" <td>Myntra</td>\n",
" <td>https://www.myntra.com/ring/peora/peora-silver...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>9</td>\n",
" <td>Royal Enfield Unisex White Urban Trooper Helme...</td>\n",
" <td>accessories</td>\n",
" <td>Royal Enfield Unisex White Urban Trooper Helme...</td>\n",
" <td>NaN</td>\n",
" <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
" <td>NaN</td>\n",
" <td>3500</td>\n",
" <td>2242802</td>\n",
" <td>Myntra</td>\n",
" <td>https://www.myntra.com/helmets/royal-enfield/r...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>10</td>\n",
" <td>BuckleUp Men Black Leather Belt</td>\n",
" <td>accessories</td>\n",
" <td>BuckleUp Men Black Leather Belt, BuckleUp, Bel...</td>\n",
" <td>Leather</td>\n",
" <td>https://assets.myntassets.com/h_1440,q_100,w_1...</td>\n",
" <td>Width: 3.5 cm</td>\n",
" <td>517</td>\n",
" <td>1734718</td>\n",
" <td>Myntra</td>\n",
" <td>https://www.myntra.com/belts/buckleup/buckleup...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" SERIAL NO NAME CATEGORY \\\n",
"0 1 U.S. Polo Assn. Men Brown Genuine Leather Two ... accessories \n",
"1 2 Baggit Men Black Solid Two Fold Wallet accessories \n",
"2 3 HRX by Hrithik Roshan Men Grey Solid Baseball Cap accessories \n",
"3 4 Puma Unisex Grey Style Military Solid Baseball... accessories \n",
"4 5 FabSeasons Beige Solid Scarf accessories \n",
"5 6 Ed Hardy Men Black Embellished Belt accessories \n",
"6 7 Roadster Men Tan Brown Leather Belt accessories \n",
"7 8 Peora Silver-Toned Rhodium-Plated Stone-Studde... accessories \n",
"8 9 Royal Enfield Unisex White Urban Trooper Helme... accessories \n",
"9 10 BuckleUp Men Black Leather Belt accessories \n",
"\n",
" DESCRIPTION & COLOR FABRIC \\\n",
"0 U.S. Polo Assn. Men Brown Genuine Leather Two ... Genuine leather \n",
"1 Baggit Men Black Solid Two Fold Wallet, Baggi... PU \n",
"2 HRX By Hrithik Roshan Men Grey Solid Baseball ... NaN \n",
"3 Puma Unisex Grey Style Military Solid Baseball... NaN \n",
"4 FabSeasons Beige Solid Scarf, FabSeasons, Scar... Acrylic \n",
"5 Ed Hardy Men Black Embellished Belt, Ed Hardy... Leather \n",
"6 Roadster Men Tan Brown Leather Belt, Roadster,... Leather \n",
"7 Peora Silver Toned Rhodium Plated Stone Studde... NaN \n",
"8 Royal Enfield Unisex White Urban Trooper Helme... NaN \n",
"9 BuckleUp Men Black Leather Belt, BuckleUp, Bel... Leather \n",
"\n",
" IMAGE SIZE PRICE \\\n",
"0 https://assets.myntassets.com/h_1440,q_100,w_1... Height: 11.5 cm 809 \n",
"1 https://assets.myntassets.com/h_1440,q_100,w_1... Height: 720 \n",
"2 https://assets.myntassets.com/h_1440,q_100,w_1... NaN 279 \n",
"3 https://assets.myntassets.com/h_1440,q_100,w_1... NaN 499 \n",
"4 https://assets.myntassets.com/h_1440,q_100,w_1... Length:0.9 m 449 \n",
"5 https://assets.myntassets.com/h_1440,q_100,w_1... Width: 3.7 cm 1199 \n",
"6 https://assets.myntassets.com/h_1440,q_100,w_1... Width: 4 cm 419 \n",
"7 https://assets.myntassets.com/h_1440,q_100,w_1... NaN 551 \n",
"8 https://assets.myntassets.com/h_1440,q_100,w_1... NaN 3500 \n",
"9 https://assets.myntassets.com/h_1440,q_100,w_1... Width: 3.5 cm 517 \n",
"\n",
" PRODUCT ID WEBSITE PRODUCT URL \n",
"0 1943420 Myntra https://www.myntra.com/wallets/us-polo-assn/us... \n",
"1 4608404 Myntra https://www.myntra.com/wallets/baggit/baggit-m... \n",
"2 2178513 Myntra https://www.myntra.com/caps/hrx-by-hrithik-ros... \n",
"3 6699035 Myntra https://www.myntra.com/caps/puma/puma-unisex-g... \n",
"4 2439658 Myntra https://www.myntra.com/scarves/fabseasons/fabs... \n",
"5 2238752 Myntra https://www.myntra.com/belts/ed-hardy/ed-hardy... \n",
"6 2975974 Myntra https://www.myntra.com/belts/roadster/roadster... \n",
"7 3006095 Myntra https://www.myntra.com/ring/peora/peora-silver... \n",
"8 2242802 Myntra https://www.myntra.com/helmets/royal-enfield/r... \n",
"9 1734718 Myntra https://www.myntra.com/belts/buckleup/buckleup... "
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import random\n",
"\n",
"do = pd.DataFrame()\n",
"do = pd.read_csv('men-products.csv', encoding='utf-8')\n",
"do.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ignore the tuple"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>total_missing</th>\n",
" <th>percent_missing</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>SERIAL NO</th>\n",
" <td>0</td>\n",
" <td>0.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>NAME</th>\n",
" <td>1</td>\n",
" <td>0.005</td>\n",
" </tr>\n",
" <tr>\n",
" <th>CATEGORY</th>\n",
" <td>0</td>\n",
" <td>0.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>DESCRIPTION &amp; COLOR</th>\n",
" <td>0</td>\n",
" <td>0.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>FABRIC</th>\n",
" <td>4833</td>\n",
" <td>24.165</td>\n",
" </tr>\n",
" <tr>\n",
" <th>IMAGE</th>\n",
" <td>0</td>\n",
" <td>0.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>SIZE</th>\n",
" <td>3838</td>\n",
" <td>19.190</td>\n",
" </tr>\n",
" <tr>\n",
" <th>PRICE</th>\n",
" <td>0</td>\n",
" <td>0.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>PRODUCT ID</th>\n",
" <td>0</td>\n",
" <td>0.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>WEBSITE</th>\n",
" <td>0</td>\n",
" <td>0.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>PRODUCT URL</th>\n",
" <td>0</td>\n",
" <td>0.000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" total_missing percent_missing\n",
"SERIAL NO 0 0.000\n",
"NAME 1 0.005\n",
"CATEGORY 0 0.000\n",
"DESCRIPTION & COLOR 0 0.000\n",
"FABRIC 4833 24.165\n",
"IMAGE 0 0.000\n",
"SIZE 3838 19.190\n",
"PRICE 0 0.000\n",
"PRODUCT ID 0 0.000\n",
"WEBSITE 0 0.000\n",
"PRODUCT URL 0 0.000"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"missing_data = pd.DataFrame({'total_missing': do.isnull().sum(), 'percent_missing': (do.isnull().sum()/20000)*100})\n",
"missing_data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# data compression with lossy"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"del do['FABRIC']\n",
"del do['IMAGE']\n",
"del do['SIZE']\n",
"del do['WEBSITE']\n",
"del do['PRODUCT URL']\n",
"del do['PRICE']\n",
"del do['PRODUCT ID']"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df = do.sample(10000, random_state=1).copy()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Rename kategori produk\n",
"df.replace({'CATEGORY': \n",
" {'accessories': 'Accesories', \n",
" 'casual-shirts': 'Casual Shirts',\n",
" 'Men-Casual-Trousers': 'Men Casual Trousers',\n",
" 'formal-shirts': 'Formal Shirts',\n",
" 'Men-Formal-Trousers': 'Men Formal Trousers',\n",
" 'men-jackets-coats': 'Men Jackets Coats',\n",
" 'men-swimwear': 'Men Swimwear',\n",
" 'men-suits': 'Men Suits'}}, \n",
" inplace= True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>SERIAL NO</th>\n",
" <th>NAME</th>\n",
" <th>CATEGORY</th>\n",
" <th>DESCRIPTION &amp; COLOR</th>\n",
" <th>CATEGORY_ID</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>25552</th>\n",
" <td>25553</td>\n",
" <td>Fort Collins Men Red Solid Padded Jacket</td>\n",
" <td>Men Jackets Coats</td>\n",
" <td>Fort Collins Men Red Solid Padded Jacket, For...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18639</th>\n",
" <td>18640</td>\n",
" <td>MANGO MAN Men Navy Blue Tailored Slim Fit Soli...</td>\n",
" <td>Men Formal Trousers</td>\n",
" <td>MANGO MAN Men Navy Blue Tailored Slim Fit Soli...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18542</th>\n",
" <td>18543</td>\n",
" <td>Arrow Men Navy Blue Tapered Fit Checked Formal...</td>\n",
" <td>Men Formal Trousers</td>\n",
" <td>Arrow Men Navy Blue Tapered Fit Checked Formal...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21474</th>\n",
" <td>21475</td>\n",
" <td>Hanes Charcoal Grey Thermal T-Shirt</td>\n",
" <td>Innerwear &amp; Sleapwear</td>\n",
" <td>Hanes Charcoal Grey Thermal T Shirt, Hanes, T...</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14858</th>\n",
" <td>14859</td>\n",
" <td>Hancock Men Blue Regular Fit Striped Formal Shirt</td>\n",
" <td>Formal Shirts</td>\n",
" <td>Hancock Men Blue Regular Fit Striped Formal Sh...</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" SERIAL NO NAME \\\n",
"25552 25553 Fort Collins Men Red Solid Padded Jacket \n",
"18639 18640 MANGO MAN Men Navy Blue Tailored Slim Fit Soli... \n",
"18542 18543 Arrow Men Navy Blue Tapered Fit Checked Formal... \n",
"21474 21475 Hanes Charcoal Grey Thermal T-Shirt \n",
"14858 14859 Hancock Men Blue Regular Fit Striped Formal Shirt \n",
"\n",
" CATEGORY \\\n",
"25552 Men Jackets Coats \n",
"18639 Men Formal Trousers \n",
"18542 Men Formal Trousers \n",
"21474 Innerwear & Sleapwear \n",
"14858 Formal Shirts \n",
"\n",
" DESCRIPTION & COLOR CATEGORY_ID \n",
"25552 Fort Collins Men Red Solid Padded Jacket, For... 0 \n",
"18639 MANGO MAN Men Navy Blue Tailored Slim Fit Soli... 1 \n",
"18542 Arrow Men Navy Blue Tapered Fit Checked Formal... 1 \n",
"21474 Hanes Charcoal Grey Thermal T Shirt, Hanes, T... 2 \n",
"14858 Hancock Men Blue Regular Fit Striped Formal Sh... 3 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create a new column 'category_id' with encoded categories \n",
"df['CATEGORY_ID'] = df['CATEGORY'].factorize()[0]\n",
"CATEGORY_ID_DF = df[['CATEGORY', 'CATEGORY_ID']].drop_duplicates()\n",
"\n",
"\n",
"# Dictionaries for future use\n",
"CATEGORY_TO_ID = dict(CATEGORY_ID_DF.values)\n",
"ID_TO_CATEGORY = dict(CATEGORY_ID_DF[['CATEGORY_ID', 'CATEGORY']].values)\n",
"\n",
"# New dataframe\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# rename column"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"df.rename(columns = {'DESCRIPTION & COLOR':'DESCRIPTION'}, inplace = True) "
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>SERIAL NO</th>\n",
" <th>NAME</th>\n",
" <th>CATEGORY</th>\n",
" <th>DESCRIPTION</th>\n",
" <th>CATEGORY_ID</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>25552</th>\n",
" <td>25553</td>\n",
" <td>Fort Collins Men Red Solid Padded Jacket</td>\n",
" <td>Men Jackets Coats</td>\n",
" <td>Fort Collins Men Red Solid Padded Jacket, For...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18639</th>\n",
" <td>18640</td>\n",
" <td>MANGO MAN Men Navy Blue Tailored Slim Fit Soli...</td>\n",
" <td>Men Formal Trousers</td>\n",
" <td>MANGO MAN Men Navy Blue Tailored Slim Fit Soli...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18542</th>\n",
" <td>18543</td>\n",
" <td>Arrow Men Navy Blue Tapered Fit Checked Formal...</td>\n",
" <td>Men Formal Trousers</td>\n",
" <td>Arrow Men Navy Blue Tapered Fit Checked Formal...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21474</th>\n",
" <td>21475</td>\n",
" <td>Hanes Charcoal Grey Thermal T-Shirt</td>\n",
" <td>Innerwear &amp; Sleapwear</td>\n",
" <td>Hanes Charcoal Grey Thermal T Shirt, Hanes, T...</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14858</th>\n",
" <td>14859</td>\n",
" <td>Hancock Men Blue Regular Fit Striped Formal Shirt</td>\n",
" <td>Formal Shirts</td>\n",
" <td>Hancock Men Blue Regular Fit Striped Formal Sh...</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>59142</th>\n",
" <td>59190</td>\n",
" <td>Tantra Men Black Printed Round Neck T-shirt</td>\n",
" <td>T-Shirts</td>\n",
" <td>Tantra Men Black Printed Round Neck T Shirt, ...</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35661</th>\n",
" <td>35662</td>\n",
" <td>Aeropostale Men Blue Regular Fit Mid-Rise Mild...</td>\n",
" <td>Jeans</td>\n",
" <td>Aeropostale Men Blue Regular Fit Mid Rise Mild...</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3631</th>\n",
" <td>3632</td>\n",
" <td>ether Men Navy Blue Slim Fit Anti Microbial Co...</td>\n",
" <td>Casual Shirts</td>\n",
" <td>Ether Men Navy Blue Slim Fit Anti Microbial Co...</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26605</th>\n",
" <td>26606</td>\n",
" <td>Roadster Men White Regular Fit Mid-Rise Clean ...</td>\n",
" <td>Jeans</td>\n",
" <td>Roadster Men White Regular Fit Mid Rise Clean ...</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21322</th>\n",
" <td>21323</td>\n",
" <td>Dollar Bigboss Pack of 3 Trunks MDTR-03-PO3-4</td>\n",
" <td>Innerwear &amp; Sleapwear</td>\n",
" <td>Dollar Bigboss Pack Of 3 Trunks MDTR 03 PO3 4,...</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" SERIAL NO NAME \\\n",
"25552 25553 Fort Collins Men Red Solid Padded Jacket \n",
"18639 18640 MANGO MAN Men Navy Blue Tailored Slim Fit Soli... \n",
"18542 18543 Arrow Men Navy Blue Tapered Fit Checked Formal... \n",
"21474 21475 Hanes Charcoal Grey Thermal T-Shirt \n",
"14858 14859 Hancock Men Blue Regular Fit Striped Formal Shirt \n",
"59142 59190 Tantra Men Black Printed Round Neck T-shirt \n",
"35661 35662 Aeropostale Men Blue Regular Fit Mid-Rise Mild... \n",
"3631 3632 ether Men Navy Blue Slim Fit Anti Microbial Co... \n",
"26605 26606 Roadster Men White Regular Fit Mid-Rise Clean ... \n",
"21322 21323 Dollar Bigboss Pack of 3 Trunks MDTR-03-PO3-4 \n",
"\n",
" CATEGORY \\\n",
"25552 Men Jackets Coats \n",
"18639 Men Formal Trousers \n",
"18542 Men Formal Trousers \n",
"21474 Innerwear & Sleapwear \n",
"14858 Formal Shirts \n",
"59142 T-Shirts \n",
"35661 Jeans \n",
"3631 Casual Shirts \n",
"26605 Jeans \n",
"21322 Innerwear & Sleapwear \n",
"\n",
" DESCRIPTION CATEGORY_ID \n",
"25552 Fort Collins Men Red Solid Padded Jacket, For... 0 \n",
"18639 MANGO MAN Men Navy Blue Tailored Slim Fit Soli... 1 \n",
"18542 Arrow Men Navy Blue Tapered Fit Checked Formal... 1 \n",
"21474 Hanes Charcoal Grey Thermal T Shirt, Hanes, T... 2 \n",
"14858 Hancock Men Blue Regular Fit Striped Formal Sh... 3 \n",
"59142 Tantra Men Black Printed Round Neck T Shirt, ... 4 \n",
"35661 Aeropostale Men Blue Regular Fit Mid Rise Mild... 5 \n",
"3631 Ether Men Navy Blue Slim Fit Anti Microbial Co... 6 \n",
"26605 Roadster Men White Regular Fit Mid Rise Clean ... 5 \n",
"21322 Dollar Bigboss Pack Of 3 Trunks MDTR 03 PO3 4,... 2 "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"X_train = df.loc[:1000, 'DESCRIPTION'].values\n",
"y_train = df.loc[:1000, 'CATEGORY_ID'].values\n",
"X_test = df.loc[:1000, 'DESCRIPTION'].values\n",
"y_test = df.loc[:1000, 'CATEGORY_ID'].values"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from tensorflow.python.keras.preprocessing.text import Tokenizer\n",
"from tensorflow.python.keras.preprocessing.sequence import pad_sequences\n",
"\n",
"tokenizer_obj = Tokenizer()\n",
"total_descriptions = X_train + X_test\n",
"tokenizer_obj.fit_on_texts(total_descriptions)\n",
"\n",
"# pad sequences\n",
"max_length = max([len(s.split()) for s in total_descriptions])\n",
"\n",
"#define vocabulary size\n",
"vocab_size = len(tokenizer_obj.word_index) + 1\n",
"\n",
"X_train_tokens = tokenizer_obj.texts_to_sequences(X_train)\n",
"X_test_tokens = tokenizer_obj.texts_to_sequences(X_test)\n",
"\n",
"X_train_pad = pad_sequences(X_train_tokens, maxlen=max_length, padding='post')\n",
"X_test_pad = pad_sequences(X_test_tokens, maxlen=max_length, padding='post')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# learn word embedding"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# build model"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Build model...\n"
]
}
],
"source": [
"from keras.models import Sequential\n",
"from keras.layers import Dense, Embedding, LSTM, GRU\n",
"from keras.layers.embeddings import Embedding\n",
"\n",
"EMBEDDING_DIM = 100\n",
"\n",
"print('Build model...')\n",
"\n",
"model = Sequential()\n",
"model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=max_length))\n",
"model.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))\n",
"model.add(Dense(1, activation='sigmoid'))\n",
"\n",
"# try using different optimizers and different optimizer configs\n",
"model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# train model"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\User\\Anaconda3\\lib\\site-packages\\tensorflow_core\\python\\framework\\indexed_slices.py:433: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
" \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train on 7853 samples, validate on 7853 samples\n",
"Epoch 1/2\n",
" - 6s - loss: -1.2412e+10 - accuracy: 0.0507 - val_loss: -2.1084e+01 - val_accuracy: 0.0506\n",
"Epoch 2/2\n",
" - 6s - loss: -1.9216e+11 - accuracy: 0.0506 - val_loss: -3.2632e+01 - val_accuracy: 0.0506\n"
]
},
{
"data": {
"text/plain": [
"<keras.callbacks.callbacks.History at 0x170704eeef0>"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print('Train...')\n",
"\n",
"model.fit(X_train_pad, y_train, batch_size=128, epochs=2, validation_data=(X_test_pad, y_test), verbose=2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# test model"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0.99995184],\n",
" [0.99995184],\n",
" [0.99995184],\n",
" [0.99995184],\n",
" [0.99995184]], dtype=float32)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Let us test some samples\n",
"\n",
"test_sample_1 = \"Puma Unisex Gery Style\"\n",
"test_sample_2 = \"Good jeans!\"\n",
"test_sample_3 = \"Maybe I like this jeans.\"\n",
"test_sample_4 = \"Not to my taste, will skip and choose another jeans\"\n",
"test_sample_5 = \"Bad jeans\"\n",
"test_samples = [test_sample_1, test_sample_2, test_sample_3, test_sample_4, test_sample_5]\n",
"\n",
"test_samples_tokens = tokenizer_obj.texts_to_sequences(test_samples)\n",
"test_samples_tokens_pad = pad_sequences(test_samples_tokens, maxlen=max_length)\n",
"\n",
"#predict\n",
"model.predict(x=test_samples_tokens_pad)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# train word2vec embedding\n",
"# text preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"import string\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"\n",
"description_lines = list()\n",
"lines = df['DESCRIPTION'].values.tolist()\n",
"\n",
"for line in lines:\n",
" tokens = word_tokenize(line)\n",
" # convert to lower case\n",
" tokens = [w.lower() for w in tokens]\n",
" # remove punctuation from each word\n",
" table = str.maketrans('','', string.punctuation)\n",
" stripped = [w.translate(table) for w in tokens]\n",
" # remove remaining tokens that are not alphabetic\n",
" words = [word for word in stripped if word.isalpha()]\n",
" #filter out stop words\n",
" stop_words = set(stopwords.words('english'))\n",
" words = [w for w in words if not w in stop_words]\n",
" description_lines.append(words)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"10000"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(description_lines)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Vocabulary size: 1543\n"
]
}
],
"source": [
"from gensim.models import Word2Vec\n",
"\n",
"#train word2vec model\n",
"model = Word2Vec(sentences=description_lines, size=EMBEDDING_DIM, window=5, workers=4, min_count=1)\n",
"#vocab size\n",
"words = list(model.wv.vocab)\n",
"print('Vocabulary size: %d' % len(words))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# test word2vec model"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('maroon', 0.6926736235618591),\n",
" ('grey', 0.6636859774589539),\n",
" ('brown', 0.6268335580825806),\n",
" ('white', 0.6045646071434021),\n",
" ('burgundy', 0.5724130868911743),\n",
" ('orange', 0.5616232752799988),\n",
" ('yellow', 0.5448090434074402),\n",
" ('blue', 0.5399770736694336),\n",
" ('beige', 0.5213257074356079),\n",
" ('mauve', 0.5187932848930359)]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.wv.most_similar('black')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('sp', 0.8349928855895996),\n",
" ('wars', 0.7996177673339844),\n",
" ('regallo', 0.761871874332428),\n",
" ('high', 0.7507779598236084),\n",
" ('star', 0.7447826266288757),\n",
" ('powell', 0.7392982840538025),\n",
" ('ice', 0.7346605062484741),\n",
" ('dyed', 0.7304413318634033),\n",
" ('density', 0.7266365885734558),\n",
" ('connection', 0.7252252101898193)]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#let's see the result of semantically reasonable word vectors\n",
"model.wv.most_similar_cosmul(positive=['woman', 'jeans'], negative=['man'])"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"black\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\User\\Anaconda3\\lib\\site-packages\\gensim\\models\\keyedvectors.py:877: FutureWarning: arrays to stack must be passed as a \"sequence\" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.\n",
" vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)\n"
]
}
],
"source": [
"#odd word out\n",
"print(model.wv.doesnt_match(\"men black jeans\".split()))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('apparel', 0.6134017705917358),\n",
" ('esprit', 0.5591789484024048),\n",
" ('highlander', 0.5499693751335144),\n",
" ('ether', 0.5297696590423584),\n",
" ('wrangler', 0.4958726167678833),\n",
" ('lee', 0.49490395188331604),\n",
" ('izod', 0.4832093119621277),\n",
" ('breakbounce', 0.471257746219635),\n",
" ('realm', 0.46333152055740356),\n",
" ('aeropostale', 0.46125224232673645)]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# look up top 10 words similar to 'men'\n",
"w1 = \"men\"\n",
"model.wv.most_similar (positive=w1)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('price', 0.6506756544113159),\n",
" ('online', 0.4503338932991028),\n",
" ('india', 0.44190680980682373),\n",
" ('partywear', 0.37221819162368774),\n",
" ('hatch', 0.33273035287857056),\n",
" ('sp', 0.31096935272216797)]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# look up top 6 words similar to 'best'\n",
"w1 = \"best\"\n",
"model.wv.most_similar (positive=w1, topn=6)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.0"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#similarity between two identical words\n",
"model.wv.similarity(w1=\"black\", w2=\"black\")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.6045646"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#similarity between two different words \n",
"model.wv.similarity(w1=\"black\", w2=\"white\")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.62683356"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#similarity between two different words\n",
"model.wv.similarity(w1=\"black\", w2=\"brown\")"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'white'"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#odd word out\n",
"model.wv.doesnt_match([\"brown\", \"white\", \"black\"])"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"#save model\n",
"filename = 'imdb_embedding_word2vec.txt'\n",
"model.wv.save_word2vec_format(filename, binary=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# use pre-trained embedding"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"embeddings_index = {}\n",
"f = open(os.path.join('', 'imdb_embedding_word2vec.txt'), encoding = \"utf-8\")\n",
"for line in f:\n",
" values = line.split()\n",
" word = values[0]\n",
" coefs = np.asarray(values[1:])\n",
" embeddings_index[word] = coefs\n",
"f.close()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 1543 unique tokens.\n",
"Shape of description tensor: (10000, 107)\n",
"Shape of category tensor: (10000,)\n"
]
}
],
"source": [
"#vectorizer the next samples into a 2D integer tensor\n",
"tokenizer_obj = Tokenizer()\n",
"tokenizer_obj.fit_on_texts(description_lines)\n",
"sequences = tokenizer_obj.texts_to_sequences(description_lines)\n",
"\n",
"#pad sequences\n",
"word_index = tokenizer_obj.word_index\n",
"print('Found %s unique tokens.' % len(word_index))\n",
"\n",
"description_pad = pad_sequences(sequences, maxlen=max_length)\n",
"category = df['CATEGORY_ID'].values\n",
"print('Shape of description tensor:', description_pad.shape)\n",
"print('Shape of category tensor:', category.shape)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"num_words = len(word_index) + 1\n",
"embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))\n",
"\n",
"for word, i in word_index.items():\n",
" if i > num_words:\n",
" continue\n",
" embedding_vector = embeddings_index.get(word)\n",
" if embedding_vector is not None:\n",
" #words not found in embedding index will be all-zeros\n",
" embedding_matrix[i] = embedding_vector"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1544\n"
]
}
],
"source": [
"print(num_words)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"from keras.models import Sequential\n",
"from keras.layers import Dense, Embedding, LSTM, GRU\n",
"from keras.layers.embeddings import Embedding\n",
"from keras.initializers import Constant\n",
"\n",
"#define model\n",
"model = Sequential()\n",
"embedding_layer = Embedding(num_words, \n",
" EMBEDDING_DIM, \n",
" embeddings_initializer=Constant(embedding_matrix), \n",
" input_length=max_length, \n",
" trainable=False)\n",
"model.add(embedding_layer)\n",
"model.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))\n",
"model.add(Dense(1, activation='sigmoid'))\n",
"\n",
"#try using different optimizers and different optimizer configs\n",
"model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"# split the data into a training set and a validation set\n",
"VALIDATION_SPLIT = 0.2\n",
"\n",
"indices = np.arange(description_pad.shape[0])\n",
"np.random.shuffle(indices)\n",
"description_pad = description_pad[indices]\n",
"category = category[indices]\n",
"num_validation_samples = int(VALIDATION_SPLIT * description_pad.shape[0])\n",
"\n",
"X_train_pad = description_pad[:-num_validation_samples]\n",
"y_train = category[:-num_validation_samples]\n",
"X_test_pad = description_pad[-num_validation_samples:]\n",
"y_test = category[-num_validation_samples:]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Shape of X_train_pad tensor: (8000, 107)\n",
"Shape of y_train tensor: (8000,)\n",
"Shape of X_test_pad tensor: (2000, 107)\n",
"Shape of y_test tensor: (2000,)\n"
]
}
],
"source": [
"print('Shape of X_train_pad tensor:', X_train_pad.shape)\n",
"print('Shape of y_train tensor:', y_train.shape)\n",
"\n",
"print('Shape of X_test_pad tensor:', X_test_pad.shape)\n",
"print('Shape of y_test tensor:', y_test.shape)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train...\n",
"Train on 8000 samples, validate on 2000 samples\n",
"Epoch 1/2\n",
" - 6s - loss: -1.2647e+09 - accuracy: 0.0514 - val_loss: -1.2498e+01 - val_accuracy: 0.0435\n",
"Epoch 2/2\n",
" - 5s - loss: -7.2270e+10 - accuracy: 0.0510 - val_loss: -2.0268e+01 - val_accuracy: 0.0435\n"
]
},
{
"data": {
"text/plain": [
"<keras.callbacks.callbacks.History at 0x1706c899f98>"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print('Train...')\n",
"\n",
"model.fit(X_train_pad, y_train, batch_size=128, epochs=2, validation_data=(X_test_pad, y_test), verbose=2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment