Commit f27cbbb1 by Febby Simanjuntak

Inverted index done

parent 3e9ccf70
......
@@ -632,64 +632,78 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def tokenize(row):\n",
" if row is None or row is '':\n",
" tokens = \"\"\n",
" else:\n",
" tokens = str(row).split(\" \")[:maxtokens]\n",
" return tokens"
"def tokenize(text):\n",
" words = word_tokenize(text)\n",
" return words"
]
},
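The rewritten `tokenize` delegates to NLTK's `word_tokenize`. A minimal sketch of how it behaves, assuming NLTK is installed and the `punkt` tokenizer models have been downloaded:

```python
import nltk
from nltk.tokenize import word_tokenize

nltk.download("punkt", quiet=True)  # word_tokenize relies on the punkt models

def tokenize(text):
    # split raw text into word tokens; punctuation becomes separate tokens
    return word_tokenize(text)

print(tokenize("The quick brown fox jumps over the lazy dog."))
# ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']
```

Unlike the old `str(row).split(" ")` version, this handles punctuation and repeated whitespace, but note it no longer truncates the result to `maxtokens`.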
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Regular expressions to remove unnecessary characters"
"### Normalization"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"def to_lowercase(data):\n",
" new_word = []\n",
" for word in data.columns:\n",
" word = word.lower()\n",
" new_word.append(word)\n",
" return new_word\n",
"\n",
"def remove_stopwords(data):\n",
" for col in data.columns:\n",
" if col in stopwords.words('english'):\n",
" data = data.drop(columns = col)\n",
" return data;\n",
"\n",
"def reg_expressions(row):\n",
" tokens = []\n",
" try:\n",
" for token in row:\n",
" token = token.lower() # make all characters lower case\n",
" token = re.sub(r'[\\W\\d]', \"\", token)\n",
" token = token[:maxtokenlen] # truncate token\n",
" tokens.append(token)\n",
" except:\n",
" token = \"\"\n",
" tokens.append(token)\n",
" return tokens"
"def normalize():\n",
" words = to_lowercase(df)\n",
" data = remove_stopwords(df)\n",
" return data"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"norm = normalize()"
]
},
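`normalize` operates on the columns of a global `df`, so here is a self-contained sketch of the same two steps on a hypothetical document-term frame (the sample terms and counts below are illustrative, not from the notebook):

```python
import nltk
import pandas as pd
from nltk.corpus import stopwords

nltk.download("stopwords", quiet=True)  # remove_stopwords needs this word list

# hypothetical document-term frame: columns are terms, rows are documents
df = pd.DataFrame([[1, 0, 2, 1],
                   [0, 1, 1, 1]],
                  columns=["The", "Fox", "is", "quick"])

df.columns = [col.lower() for col in df.columns]  # the to_lowercase step
stop_words = set(stopwords.words("english"))
df = df.drop(columns=[c for c in df.columns if c in stop_words])  # the remove_stopwords step
print(list(df.columns))  # ['fox', 'quick'] -- 'the' and 'is' are dropped
```

Dropping stop-word columns only makes sense if the terms really are the column labels; if the documents live in a text column instead, the filtering would have to happen per row.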
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Stop-word removal"
"### Inverted Index"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"def stop_word_removal(row):\n",
" token = [token for token in row if token not in stopwords]\n",
" token = filter(None, token)\n",
" return token"
"\n",
"def create_Inverted_index(all_unique_documents):\n",
" inverted_index = {}\n",
" for doc_id in range(len(all_unique_documents)):\n",
" for term in all_unique_documents[doc_id]:\n",
" if term not in inverted_index:\n",
" inverted_index[term] = []\n",
" inverted_index[term].append(doc_id) \n",
" return inverted_index"
]
},
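To see the index in action, a short usage sketch reusing `create_Inverted_index` from the cell above (the toy documents and the `boolean_and` helper are illustrative additions, not part of the notebook):

```python
# toy corpus: each document is already tokenized and normalized
docs = [
    ["fox", "quick", "brown"],   # doc 0
    ["quick", "blue", "hare"],   # doc 1
    ["brown", "fox"],            # doc 2
]

index = create_Inverted_index(docs)
print(index["quick"])  # [0, 1]
print(index["fox"])    # [0, 2]

def boolean_and(index, *terms):
    # intersect the postings lists of every query term; a term missing
    # from the index contributes an empty set, so the result is empty
    postings = [set(index.get(t, [])) for t in terms]
    return sorted(set.intersection(*postings)) if postings else []

print(boolean_and(index, "quick", "fox"))  # [0] -- only doc 0 has both terms
```

Because documents are visited in increasing `doc_id` order, each postings list comes out sorted, which is what makes merge-style intersection cheap in a full implementation.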
{
......