Commit e1bebfc7 by Febby Simanjuntak

update

parent f27cbbb1
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Dataset and Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
" \n",
"from sklearn.feature_extraction.text import TfidfTransformer\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
" \n",
"# this is a very toy example, do not try this at home unless you want to understand the usage differences\n",
"docs=[\"the\", \"to\", \"ect\", \"and\", \"for\", \"of\", \"a\", \"you\", \"hou\", \"in\", \"on\", \"is\", \"this\", \"enron\", \"i\", \"be\", \"that\", \"will\",\n",
" \"have\", \"with\", \"your\",\"at\", \"we\", \"are\", \"it\", \"by\", \"com\", \"as\", \"from\", \"gas\", \"or\",\"not\", \"not\", \"me\", \"deal\", \"if\",\n",
" \"meter\",\"hpl\", \"please\",\"re\", \"e\", \"any\", \"our\", \"corp\",\"can\", \"d\", \"all\", \"has\", \"was\", \"know\", \"need\", \"an\", \"forwarded\", \n",
" \"new\", \"t\", \"may\", \"up\", \"j\",\"should\", \"do\", \"am\", \"out\", \"see\", \"no\", \"there\", \"price\", \"daren\", \"but\", \"been\", \"company\", \n",
" \"I\", \"these\", \"let\", \"so\", \"would\", \"m\", \"into\", \"xls\", \"farmer\", \"attached\", \"us\", \"information\", \"they\", \"message\", \n",
" \"day\", \"time\", \"my\", \"one\", \"what\", \"only\", \"http\", \"th\", \"volume\", \"mail\", \"contract\", \"which\", \"month\",\n",
" \"more\", \"robert\", \"sitara\", \"obout\", \"texas\", \"nom\", \"energy\", \"pec\", \"questions\", \"www\", \"deals\", \"volumes\", \"pm\", \"ena\",\n",
" \"now\", \"their\", \"file\", \"some\", \"email\", \"just\", \"also\", \"call\", \"change\", \"other\", \"here\", \"like\", \"b\", \"flow\", \"net\", \n",
" \"following\", \"p\", \"production\",\"when\", \"over\", \"back\", \"want\", \"original\", \"them\", \"below\", \"o\", \"ticket\", \"c\", \"he\",\n",
" \"could\", \"make\", \"inc\", \"report\", \"march\", \"contact\", \"were\", \"days\", \"list\", \"nomination\", \"system\", \"who\", \"april\", \n",
" \"number\", \"sale\", \"don\", \"its\", \"first\", \"thanks\", \"business\",\"help\", \"per\", \"through\", \"july\", \"forward\", \"font\", \"free\", \n",
" \"daily\", \"use\", \"order\", \"today\", \"r\", \"had\", \"fw\", \"set\", \"plant\", \"statements\", \"go\", \"gary\", \"oil\", \"line\", \"sales\", \n",
" \"w\", \"effective\", \"well\", \"tenaska\", \"take\",\"june\",\"x\", \"within\",\"nbsp\", \"she\", \"how\", \"north\", \"america\", \"being\", \n",
" \"under\", \"next\", \"week\", \"than\", \"january,\" \"la\"\n",
" ]"
]
},
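{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (added here as an illustration, not part of the original walkthrough): every entry in `docs` is a single word, so each \"document\" in this toy corpus is one token long."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# each \"document\" in this toy corpus is a single word\n",
"print(len(docs), \"documents\")\n",
"print(docs[:5])"
]
},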
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Initialize CountVectorizer"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#instantiate CountVectorizer()\n",
"cv=CountVectorizer()\n",
" \n",
"# this steps generates word counts for the words in your docs\n",
"word_count_vector=cv.fit_transform(docs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"word_count_vector.shape"
]
},
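{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a side note (an addition, not part of the original notebook), the vocabulary learned by `CountVectorizer` can be inspected directly. The sketch below assumes a scikit-learn version where `get_feature_names()` is still available, matching the calls used later in this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# term -> column index mapping learned by CountVectorizer\n",
"print(len(cv.vocabulary_), \"terms in the vocabulary\")\n",
" \n",
"# first few feature names, in column order\n",
"print(cv.get_feature_names()[:10])"
]
},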
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Compute the IDF values"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)\n",
"tfidf_transformer.fit(word_count_vector)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# print idf values\n",
"df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(),columns=[\"idf_weights\"])\n",
" \n",
"# sort ascending\n",
"df_idf.sort_values(by=['idf_weights'])"
]
},
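{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the `smooth_idf=True` weighting concrete, the cell below (an addition to the original notebook) recomputes the IDF values by hand with scikit-learn's smoothed formula, idf(t) = ln((1 + n) / (1 + df(t))) + 1, and checks that the result matches `tfidf_transformer.idf_`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
" \n",
"# document frequency: number of docs each term appears in\n",
"df = (word_count_vector > 0).sum(axis=0).A1\n",
"n_docs = word_count_vector.shape[0]\n",
" \n",
"# smoothed IDF, as used by TfidfTransformer(smooth_idf=True)\n",
"manual_idf = np.log((1 + n_docs) / (1 + df)) + 1\n",
" \n",
"# should match the fitted idf_ values\n",
"print(np.allclose(manual_idf, tfidf_transformer.idf_))"
]
},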
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Compute the TFIDF score for your documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# count matrix\n",
"count_vector=cv.transform(docs)\n",
" \n",
"# tf-idf scores\n",
"tf_idf_vector=tfidf_transformer.transform(count_vector)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"feature_names = cv.get_feature_names()\n",
" \n",
"#get tfidf vector for first document\n",
"first_document_vector=tf_idf_vector[0]\n",
" \n",
"#print the scores\n",
"df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=[\"tfidf\"])\n",
"df.sort_values(by=[\"tfidf\"],ascending=False)"
]
},
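{
"cell_type": "markdown",
"metadata": {},
"source": [
"One detail worth verifying (this cell is an addition): `TfidfTransformer` applies L2 normalization by default (`norm='l2'`), so every non-empty row of the tf-idf matrix should have unit length. Rows for the single-letter \"documents\" stay empty because `CountVectorizer`'s default token pattern drops one-character tokens."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
" \n",
"# L2 norm of every row of the tf-idf matrix\n",
"row_norms = np.sqrt(tf_idf_vector.multiply(tf_idf_vector).sum(axis=1)).A1\n",
" \n",
"# single-letter \"documents\" are dropped by the default token pattern,\n",
"# so their rows are all zeros; every other row has unit length\n",
"print(\"empty rows:\", int((row_norms == 0).sum()))\n",
"print(np.allclose(row_norms[row_norms > 0], 1.0))"
]
},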
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tfidfvectorizer Usage"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer \n",
" \n",
"# settings that you use for count vectorizer will go here\n",
"tfidf_vectorizer=TfidfVectorizer(use_idf=True)\n",
" \n",
"# just send in all your docs here\n",
"tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(docs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# get the first vector out (for the first document)\n",
"first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0]\n",
" \n",
"# place tf-idf values in a pandas data frame\n",
"df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=[\"tfidf\"])\n",
"df.sort_values(by=[\"tfidf\"],ascending=False)"
]
},
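{
"cell_type": "markdown",
"metadata": {},
"source": [
"To confirm that the two usage patterns agree, the cell below (an addition to the original notebook) compares the matrix produced by `CountVectorizer` + `TfidfTransformer` with the one produced by `TfidfVectorizer`; with the default settings used above they should be numerically identical."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
" \n",
"# CountVectorizer + TfidfTransformer vs. TfidfVectorizer on the same docs\n",
"print(np.allclose(tf_idf_vector.toarray(), tfidf_vectorizer_vectors.toarray()))"
]
},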
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tfidf_vectorizer=TfidfVectorizer(use_idf=True)\n",
" \n",
"# just send in all your docs here\n",
"fitted_vectorizer=tfidf_vectorizer.fit(docs)\n",
"tfidf_vectorizer_vectors=fitted_vectorizer.transform(docs)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}