Commit e1bebfc7 by Febby Simanjuntak

update

parent f27cbbb1
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Dataset and Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
" \n",
"from sklearn.feature_extraction.text import TfidfTransformer\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
" \n",
"# this is a very toy example, do not try this at home unless you want to understand the usage differences\n",
"docs=[\"the\", \"to\", \"ect\", \"and\", \"for\", \"of\", \"a\", \"you\", \"hou\", \"in\", \"on\", \"is\", \"this\", \"enron\", \"i\", \"be\", \"that\", \"will\",\n",
" \"have\", \"with\", \"your\",\"at\", \"we\", \"are\", \"it\", \"by\", \"com\", \"as\", \"from\", \"gas\", \"or\",\"not\", \"not\", \"me\", \"deal\", \"if\",\n",
" \"meter\",\"hpl\", \"please\",\"re\", \"e\", \"any\", \"our\", \"corp\",\"can\", \"d\", \"all\", \"has\", \"was\", \"know\", \"need\", \"an\", \"forwarded\", \n",
" \"new\", \"t\", \"may\", \"up\", \"j\",\"should\", \"do\", \"am\", \"out\", \"see\", \"no\", \"there\", \"price\", \"daren\", \"but\", \"been\", \"company\", \n",
" \"I\", \"these\", \"let\", \"so\", \"would\", \"m\", \"into\", \"xls\", \"farmer\", \"attached\", \"us\", \"information\", \"they\", \"message\", \n",
" \"day\", \"time\", \"my\", \"one\", \"what\", \"only\", \"http\", \"th\", \"volume\", \"mail\", \"contract\", \"which\", \"month\",\n",
" \"more\", \"robert\", \"sitara\", \"obout\", \"texas\", \"nom\", \"energy\", \"pec\", \"questions\", \"www\", \"deals\", \"volumes\", \"pm\", \"ena\",\n",
" \"now\", \"their\", \"file\", \"some\", \"email\", \"just\", \"also\", \"call\", \"change\", \"other\", \"here\", \"like\", \"b\", \"flow\", \"net\", \n",
" \"following\", \"p\", \"production\",\"when\", \"over\", \"back\", \"want\", \"original\", \"them\", \"below\", \"o\", \"ticket\", \"c\", \"he\",\n",
" \"could\", \"make\", \"inc\", \"report\", \"march\", \"contact\", \"were\", \"days\", \"list\", \"nomination\", \"system\", \"who\", \"april\", \n",
" \"number\", \"sale\", \"don\", \"its\", \"first\", \"thanks\", \"business\",\"help\", \"per\", \"through\", \"july\", \"forward\", \"font\", \"free\", \n",
" \"daily\", \"use\", \"order\", \"today\", \"r\", \"had\", \"fw\", \"set\", \"plant\", \"statements\", \"go\", \"gary\", \"oil\", \"line\", \"sales\", \n",
" \"w\", \"effective\", \"well\", \"tenaska\", \"take\",\"june\",\"x\", \"within\",\"nbsp\", \"she\", \"how\", \"north\", \"america\", \"being\", \n",
" \"under\", \"next\", \"week\", \"than\", \"january,\" \"la\"\n",
" ]"
]
},
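{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (added here as an illustration, not part of the original walkthrough): every entry in `docs` is a single word, so each \"document\" in this toy corpus is one token long."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# each \"document\" in this toy corpus is a single word\n",
"print(len(docs), \"documents\")\n",
"print(docs[:5])"
]
},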
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Initialize CountVectorizer"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#instantiate CountVectorizer()\n",
"cv=CountVectorizer()\n",
" \n",
"# this steps generates word counts for the words in your docs\n",
"word_count_vector=cv.fit_transform(docs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"word_count_vector.shape"
]
},
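{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a side note (an addition, not part of the original notebook), the vocabulary learned by `CountVectorizer` can be inspected directly. The sketch below assumes a scikit-learn version where `get_feature_names()` is still available, matching the calls used later in this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# term -> column index mapping learned by CountVectorizer\n",
"print(len(cv.vocabulary_), \"terms in the vocabulary\")\n",
" \n",
"# first few feature names, in column order\n",
"print(cv.get_feature_names()[:10])"
]
},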
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Compute the IDF values"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)\n",
"tfidf_transformer.fit(word_count_vector)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# print idf values\n",
"df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(),columns=[\"idf_weights\"])\n",
" \n",
"# sort ascending\n",
"df_idf.sort_values(by=['idf_weights'])"
]
},
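{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the `smooth_idf=True` weighting concrete, the cell below (an addition to the original notebook) recomputes the IDF values by hand with scikit-learn's smoothed formula, idf(t) = ln((1 + n) / (1 + df(t))) + 1, and checks that the result matches `tfidf_transformer.idf_`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
" \n",
"# document frequency: number of docs each term appears in\n",
"df = (word_count_vector > 0).sum(axis=0).A1\n",
"n_docs = word_count_vector.shape[0]\n",
" \n",
"# smoothed IDF, as used by TfidfTransformer(smooth_idf=True)\n",
"manual_idf = np.log((1 + n_docs) / (1 + df)) + 1\n",
" \n",
"# should match the fitted idf_ values\n",
"print(np.allclose(manual_idf, tfidf_transformer.idf_))"
]
},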
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Compute the TFIDF score for your documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# count matrix\n",
"count_vector=cv.transform(docs)\n",
" \n",
"# tf-idf scores\n",
"tf_idf_vector=tfidf_transformer.transform(count_vector)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"feature_names = cv.get_feature_names()\n",
" \n",
"#get tfidf vector for first document\n",
"first_document_vector=tf_idf_vector[0]\n",
" \n",
"#print the scores\n",
"df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=[\"tfidf\"])\n",
"df.sort_values(by=[\"tfidf\"],ascending=False)"
]
},
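{
"cell_type": "markdown",
"metadata": {},
"source": [
"One detail worth verifying (this cell is an addition): `TfidfTransformer` applies L2 normalization by default (`norm='l2'`), so every non-empty row of the tf-idf matrix should have unit length. Rows for the single-letter \"documents\" stay empty because `CountVectorizer`'s default token pattern drops one-character tokens."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
" \n",
"# L2 norm of every row of the tf-idf matrix\n",
"row_norms = np.sqrt(tf_idf_vector.multiply(tf_idf_vector).sum(axis=1)).A1\n",
" \n",
"# single-letter \"documents\" are dropped by the default token pattern,\n",
"# so their rows are all zeros; every other row has unit length\n",
"print(\"empty rows:\", int((row_norms == 0).sum()))\n",
"print(np.allclose(row_norms[row_norms > 0], 1.0))"
]
},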
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tfidfvectorizer Usage"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer \n",
" \n",
"# settings that you use for count vectorizer will go here\n",
"tfidf_vectorizer=TfidfVectorizer(use_idf=True)\n",
" \n",
"# just send in all your docs here\n",
"tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(docs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# get the first vector out (for the first document)\n",
"first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0]\n",
" \n",
"# place tf-idf values in a pandas data frame\n",
"df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=[\"tfidf\"])\n",
"df.sort_values(by=[\"tfidf\"],ascending=False)"
]
},
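{
"cell_type": "markdown",
"metadata": {},
"source": [
"To confirm that the two usage patterns agree, the cell below (an addition to the original notebook) compares the matrix produced by `CountVectorizer` + `TfidfTransformer` with the one produced by `TfidfVectorizer`; with the default settings used above they should be numerically identical."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
" \n",
"# CountVectorizer + TfidfTransformer vs. TfidfVectorizer on the same docs\n",
"print(np.allclose(tf_idf_vector.toarray(), tfidf_vectorizer_vectors.toarray()))"
]
},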
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tfidf_vectorizer=TfidfVectorizer(use_idf=True)\n",
" \n",
"# just send in all your docs here\n",
"fitted_vectorizer=tfidf_vectorizer.fit(docs)\n",
"tfidf_vectorizer_vectors=fitted_vectorizer.transform(docs)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}