Commit d2786b64 by Rosa Delima Mendrofa

Query Searching

parents 74c49902 13ae1c62
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Kelompok 3 | Search Engine with Inverted Index Simulator Based on Billboard Songs Collection\n",
" - 12S16003 Maria H. Siallagan\n",
" - 12S16026 Yolanda Nainggolan\n",
" - 12S16036 Prima Hutapea\n",
" - 12S16049 Rosa Delima Mendrofa"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: 'dataset_STBI.xml'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-1-0410f424fcaa>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfeature_extraction\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtext\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mCountVectorizer\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mxml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdom\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mminidom\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mminidom\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[0mdcmnt_xml\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mminidom\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"dataset_STBI.xml\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m~\\Anaconda3\\lib\\xml\\dom\\minidom.py\u001b[0m in \u001b[0;36mparse\u001b[1;34m(file, parser, bufsize)\u001b[0m\n\u001b[0;32m 1956\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mparser\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mbufsize\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1957\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mxml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdom\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mexpatbuilder\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1958\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mexpatbuilder\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1959\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1960\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mxml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdom\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mpulldom\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\xml\\dom\\expatbuilder.py\u001b[0m in \u001b[0;36mparse\u001b[1;34m(file, namespaces)\u001b[0m\n\u001b[0;32m 908\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 909\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 910\u001b[1;33m \u001b[1;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'rb'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mfp\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 911\u001b[0m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mbuilder\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparseFile\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfp\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 912\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'dataset_STBI.xml'"
]
}
],
"source": [
"import string\n",
"import re\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"import xml.dom.minidom as minidom\n",
"dcmnt_xml = minidom.parse(\"dataset_STBI.xml\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"all_doc_no = dcmnt_xml.getElementsByTagName('DOCNO')\n",
"all_profile = dcmnt_xml.getElementsByTagName('SONG')\n",
"all_date = dcmnt_xml.getElementsByTagName('ARTIST')\n",
"all_text = dcmnt_xml.getElementsByTagName('LYRICS')\n",
"all_pub = dcmnt_xml.getElementsByTagName('PUB')\n",
"all_page = dcmnt_xml.getElementsByTagName('PAGE')\n",
"\n",
"N_DOC_sample = len(all_doc_no)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"all_sentence_doc_sample = []\n",
"for i in range(N_DOC_sample):\n",
" sentence_doc_sample = ' '+ all_text[i].firstChild.data\n",
" all_sentence_doc_sample.append(sentence_doc_sample)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Preprocessing "
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"tokens_doc = []"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def remove_punc_tokenize(sentence):\n",
" tokens = []\n",
" for punctuation in string.punctuation:\n",
" sentence = sentence.replace(punctuation,\" \")\n",
" \n",
" sentence = re.sub(r'^https?:\\/\\/.*[\\r\\n]*', '', sentence, flags=re.MULTILINE)\n",
" for w in CountVectorizer().build_tokenizer()(sentence):\n",
" tokens.append(w)\n",
" return tokens"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"for i in range(N_DOC):\n",
" tokens_doc.append(remove_punc_tokenize(all_sentence_doc_sample[i]))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"from nltk.corpus import stopwords\n",
"stop_words = set(stopwords.words('english'))\n",
"def stop_word_token(tokens):\n",
" tokens = [w for w in tokens if not w in stop_words]\n",
" return tokens\n",
"\n",
"for i in range(N_DOC):\n",
" tokens_doc[i] = stop_word_token(tokens_doc[i])"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"for i in range(N_DOC):\n",
" tokens_doc[i] = ([w for w in tokens_doc[i] if not any(j.isdigit() for j in w)])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from nltk.stem import PorterStemmer\n",
"stemmer = PorterStemmer()\n",
"def stemming(tokens):\n",
" for i in range(0, len(tokens)):\n",
" if (tokens[i] != stemmer.stem(tokens[i])):\n",
" tokens[i] = stemmer.stem(tokens[i])\n",
" return tokens\n",
"\n",
"\n",
"for i in range(N_DOC):\n",
" tokens_doc[i] = stemming(tokens_doc[i])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"all_tokens = []\n",
"for i in range(N_DOC):\n",
" for w in tokens_doc[i]:\n",
" all_tokens.append(w)\n",
"\n",
"new_sentence = ' '.join([w for w in all_tokens])\n",
"\n",
"for w in CountVectorizer().build_tokenizer()(new_sentence):\n",
" all_tokens.append(w)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"all_tokens = set(all_tokens)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from itertools import count\n",
"try: \n",
" from itertools import izip as zip\n",
"except ImportError:\n",
" pass\n",
"proximity_index = {}\n",
"for token in all_tokens:\n",
" dict_doc_position = {}\n",
" for n in range(N_DOC):\n",
" if(token in tokens_doc[n]):\n",
" dict_doc_position[all_doc_no[n].firstChild.data] = [i+1 for i, j in zip(count(), tokens_doc[n]) if j == token]\n",
" proximity_index[token] = dict_doc_position"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import collections\n",
"proximity_index = collections.OrderedDict(sorted(proximity_index.items()))\n",
"for key, value in proximity_index.items():\n",
" print (key, value)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -3,15 +3,13 @@ resource_package = __name__
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from itertools import count
import collections
import math
import xml.etree.ElementTree as et
from xml.etree.ElementTree import ElementTree
......@@ -50,20 +48,37 @@ def generate_ngrams(data, n):
return ngram, result
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def stop_word_token(tokens):
tokens = [w for w in tokens if not w in stop_words]
return tokens
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
def stemming(tokens):
for i in range(0, len(tokens)):
if (tokens[i] != stemmer.stem(tokens[i])):
tokens[i] = stemmer.stem(tokens[i])
return tokens
def main(query):
tree = ElementTree()
tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
all_doc_no = []
all_headline = []
all_song = []
all_text = []
for node in tree.iter("DOCNO"):
all_doc_no.append(node.text)
for node in tree.iter("SONG"):
all_headline.append(node.text)
all_song.append(node.text)
for node in tree.iter("LYRICS"):
all_text.append(node.text)
......@@ -72,7 +87,7 @@ def main(query):
all_sentence_doc = []
for i in range(N_DOC):
all_sentence_doc.append(all_headline[i] + all_text[i])
all_sentence_doc.append(all_song[i] + all_text[i])
tokens_doc = []
for i in range(N_DOC):
tokens_doc.append(remove_punc_tokenize(all_sentence_doc[i]))
......@@ -80,7 +95,7 @@ def main(query):
for i in range(N_DOC):
tokens_doc[i] = to_lower(tokens_doc[i])
stop_words = set(stopwords.words('indonesian'))
stop_words = set(stopwords.words('english'))
stopping = []
for i in range(N_DOC):
......@@ -189,7 +204,7 @@ def main(query):
score*=idf[i] #tf * idf
idx = all_doc_no[i]
judul = all_headline[i]
judul = all_song[i]
dic['docno'] = idx
dic['judul'] = judul
......@@ -205,20 +220,25 @@ def main(query):
return hasil
def detail(nomor):
<<<<<<< HEAD:SearchEngine-master/SearchEngine/InvertedIndexSimulator/inverted/main.py
tree = ElementTree()
tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
=======
tree = et()
tree.parse("apps/data/dataset_STBI.xml")
>>>>>>> 13ae1c6214da83348ebb0752338974874c4f66ae:SearchEngine/InvertedIndexSimulator/inverted/main.py
all_doc_no = []
all_headline = []
all_song = []
all_text = []
for node in tree.iter("DOCNO"):
all_doc_no.append(node.text)
for node in tree.iter("SONG"):
# all_headline.append(node.text.replace("\n"," "))
all_headline.append(node.text)
head = all_headline
# all_song.append(node.text.replace("\n"," "))
all_song.append(node.text)
head = all_song
for node in tree.iter("LYRICS"):
# all_text.append(node.text.replace("\n"," "))
......@@ -233,5 +253,5 @@ def detail(nomor):
check = all_doc_no[i]
if check == id:
text = all_text[i]
judul = all_headline[i]
judul = all_song[i]
return text,judul
\ No newline at end of file
......@@ -55,6 +55,15 @@ footer {
border-radius: 15px;
padding: 20px;
margin-top: 10px;
width: auto;
}
.carda {
box-shadow: 0 4px 8px 0 rgba(0, 0, 0, 0.2);
border-radius: 15px;
padding: 20px;
margin-top: 10px;
width: max-content;
}
.jumbotron {
......
......@@ -5,35 +5,103 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Song Lyric Search Engine</title>
<link href="../../static/assets/css/dataframe.min.css" rel="stylesheet">
<style>
#leftbox {
float:left;
white-space: nowrap;
}
#middlebox{
float:left;
white-space: nowrap;
}
#middleboxa{
float:left;
white-space: nowrap;
}
#rightbox{
float:right;
white-space: nowrap;
}
</style>
</head>
<body>
<main>
<div id="content">
<article class="card">
<div align="right">
<button onclick="pageRedirect()" class="button" style="vertical-align:middle"><span>Next</span></button>
</div>
<center><h1>Dataset</h1><br>
<table style="width:100%">
<tr>
<th>DOCNO</th>
<th>SONG</th>
<th>ARTIST</th>
<th>LYRICS</th>
</tr>
<div>
<div>
<button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
</div>
<div align="right">
<button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
</div>
</div>
<center><h1>Dataset</h1><br></center>
<article class="carda" style="overflow-x:scroll; overflow-y:scroll;">
{% for l in LYRICS %}
<tr>
<td>{{ i }}</td>
<td>{{ j }}</td>
<td>{{ k }}</td>
<td>{{ l }}</td>
</tr>
{% endfor %}
<div id = "leftbox">
<table>
<tr>
<th>DOCNO</th>
</tr>
{% for i in DOCNO %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
<div id = "middlebox">
<table align="left">
<tr>
<th>SONG</th>
</tr>
{% for i in SONG %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
<div id = "middlebox">
<table>
<tr>
<th>ARTIST</th>
</tr>
{% for i in ARTIST %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
<div id = "middlebox">
<table>
<tr>
<th>LYRICS</th>
</tr>
{% for i in LYRICS %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
</table>
</center>
</article>
</article>
</div>
......@@ -46,9 +114,13 @@
</body>
<script>
function pageRedirect() {
window.location.href = "/preprocessing";
}
function pageRedirect_prev() {
window.location.href = "/home";
}
function pageRedirect_next() {
window.location.href = "/preprocessing";
}
</script>
</html>
......@@ -11,25 +11,44 @@
<main>
<div id="content">
<article class="card">
<div>
<div>
<button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
</div>
<div align="right">
<button onclick="pageRedirect()" class="button" style="vertical-align:middle"><span>Next</span></button>
<button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
</div>
</div>
<center><h1>Indexing</h1><br></center>
<p><strong>Dengan Proximity Index</strong></p><br></center>
<table style="width:100%">
<tr>
<th>Apa judulnya ya?</th>
</tr>
{% for i in indexnya %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</article>
</div>
</main>
<footer>
<p>&copy; STBI-2020-03</p>
</footer>
</body>
<script>
function pageRedirect() {
window.location.href = "/index";
}
function pageRedirect_prev() {
window.location.href = "/preprocessing4";
}
function pageRedirect_next() {
window.location.href = "/index";
}
</script>
</html>
......@@ -11,25 +11,45 @@
<main>
<div id="content">
<article class="card">
<div>
<div>
<button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
</div>
<div align="right">
<button onclick="pageRedirect()" class="button" style="vertical-align:middle"><span>Next</span></button>
<button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
</div>
<center><h1>Text Preprocessing</h1><br></center>
</div>
<center><p style="font-size:40px;"><strong>Text Preprocessing - 1</strong></p>
<p><strong>After Punctuation Removal and Tokenization</strong></p><br></center>
<table style="width:100%">
<tr>
<th>All tokens for each document</th>
</tr>
{% for i in tokens_doc %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</article>
</div>
</main>
<footer>
<p>&copy; STBI-2020-03</p>
</footer>
</body>
<script>
function pageRedirect() {
window.location.href = "/indexing";
}
function pageRedirect_prev() {
window.location.href = "/dataframe";
}
function pageRedirect_next() {
window.location.href = "/preprocessing2";
}
</script>
</html>
<!DOCTYPE html>
<html lang="en">
<head>
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Song Lyric Search Engine</title>
<link href="../../static/assets/css/dataframe.min.css" rel="stylesheet">
</head>
<body>
<main>
<div id="content">
<article class="card">
<div>
<div>
<button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
</div>
<div align="right">
<button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
</div>
</div>
<center><p style="font-size:40px;"><strong>Text Preprocessing - 2</strong></p>
<p><strong>After Case Folding</strong></p><br></center>
<table style="width:100%">
<tr>
<th>All tokens for each document</th>
</tr>
{% for i in tokens_doc %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</article>
</div>
</main>
</body>
<script>
function pageRedirect_prev() {
window.location.href = "/preprocessing";
}
function pageRedirect_next() {
window.location.href = "/preprocessing3";
}
</script>
</html>
<!DOCTYPE html>
<html lang="en">
<head>
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Song Lyric Search Engine</title>
<link href="../../static/assets/css/dataframe.min.css" rel="stylesheet">
</head>
<body>
<main>
<div id="content">
<article class="card">
<div>
<div>
<button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
</div>
<div align="right">
<button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
</div>
</div>
<center><p style="font-size:40px;"><strong>Text Preprocessing - 3</strong></p>
<p><strong>After Stopwords Removal</strong></p><br></center>
<table style="width:100%">
<tr>
<th>All tokens for each document</th>
</tr>
{% for i in tokens_doc %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</article>
</div>
</main>
</body>
<script>
function pageRedirect_prev() {
window.location.href = "/preprocessing2";
}
function pageRedirect_next() {
window.location.href = "/preprocessing4";
}
</script>
</html>
<!DOCTYPE html>
<html lang="en">
<head>
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Song Lyric Search Engine</title>
<link href="../../static/assets/css/dataframe.min.css" rel="stylesheet">
</head>
<body>
<main>
<div id="content">
<article class="card">
<div>
<div>
<button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
</div>
<div align="right">
<button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
</div>
</div>
<center><p style="font-size:40px;"><strong>Text Preprocessing - 4</strong></p>
<p><strong>After Normalization</strong></p><br></center>
<table style="width:100%">
<tr>
<th>All tokens for each document</th>
</tr>
{% for i in tokens_doc %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</article>
</div>
</main>
</body>
<script>
function pageRedirect_prev() {
window.location.href = "/preprocessing3";
}
function pageRedirect_next() {
window.location.href = "/indexing";
}
</script>
</html>
......@@ -44,7 +44,11 @@
<div class="col-lg-4">
<div class="testimonial-item mx-auto mb-5 mb-lg-0">
<<<<<<< HEAD:SearchEngine-master/SearchEngine/InvertedIndexSimulator/templates/apps/result.html
<img class="img-fluid rounded-circle mb-3" src="../../static/img/billboard-logo-2016-1548-768x433.jpeg" alt="">
=======
<img class="img-fluid rounded-circle mb-3" src="../../static/img/hkbp.jpg" alt="">
>>>>>>> 13ae1c6214da83348ebb0752338974874c4f66ae:SearchEngine/InvertedIndexSimulator/templates/apps/result.html
<h5><a href="/lyric">Lagu No:{{ j.docno }}</a></h5>
<h5>"{{ j.judul }}"</h5>
<p class="font-weight-light mb-0">score :{{ j.score }}</p>
......
......@@ -10,6 +10,12 @@ urlpatterns = [
path('', views.home),
path('dataframe/', views.dataframe),
path('preprocessing/', views.preprocessing),
<<<<<<< HEAD:SearchEngine-master/SearchEngine/InvertedIndexSimulator/urls.py
=======
path('preprocessing2/', views.preprocessing2),
path('preprocessing3/', views.preprocessing3),
path('preprocessing4/', views.preprocessing4),
>>>>>>> 13ae1c6214da83348ebb0752338974874c4f66ae:SearchEngine/InvertedIndexSimulator/urls.py
path('indexing/', views.indexing),
path('index/', views.index),
path('result/', views.result),
......
......@@ -22,6 +22,12 @@ urlpatterns = [
path('', views.home),
path('dataframe/', views.dataframe),
path('preprocessing/', views.preprocessing),
<<<<<<< HEAD:SearchEngine-master/SearchEngine/SearchEngine/urls.py
=======
path('preprocessing2/', views.preprocessing2),
path('preprocessing3/', views.preprocessing3),
path('preprocessing4/', views.preprocessing4),
>>>>>>> 13ae1c6214da83348ebb0752338974874c4f66ae:SearchEngine/SearchEngine/urls.py
path('indexing/', views.indexing),
path('index/', views.index),
path('result/', views.result),
......
<!DOCTYPE html>
<html lang="en">
<head>
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Song Lyric Search Engine</title>
<link href="../../static/assets/css/landing-page.min.css" rel="stylesheet">
</head>
<body>
<header>
<div class="jumbotron">
<h1>Song Lyric Search Engine<br>- Simulator -</h1>
<p>Search engine yang pake inverted index untuk indexing nya</p>
</div>
</header>
<main>
<div id="content">
<article class="card">
<center><h1>Pilih Dataset</h1><br>
<table>
<tr>
<th><button onclick="pageRedirect()" class="button" style="vertical-align:middle"><span>International Billboard Song </span></button></th>
<td><button class="button" style="vertical-align:middle"><span>Indonesian Song </span></button></td>
</tr>
</table>
</center>
</article>
</div>
</main>
<footer>
<p>&copy; STBI-2020-03</p>
</footer>
</body>
<script>
function pageRedirect() {
window.location.href = "/dataframe";
}
</script>
</html>
<!DOCTYPE html>
<html lang="en">
<head>
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Song Lyric Search Engine</title>
<link href="../../static/assets/css/dataframe.min.css" rel="stylesheet">
</head>
<body>
<main>
<div id="content">
<article class="card">
<div>
<div>
<button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
</div>
</div>
<div class="row">
<center><h1 style="font-size:45px">Searching!<br></h1>
<p style="font-size:20px"><strong>Silahkan masukkan lirik dari lagu yang ingin Anda temukan</strong></p>
<form method="POST" action="/result/">
{% csrf_token %}
<div class="form-row">
<input type="text" name="querysearch" placeholder="Masukkan Query Anda..."> <br>
<button type="submit">Cari!</button>
</div>
</form>
</div>
</center>
</article>
</div>
</main>
</body>
<script>
function pageRedirect_prev() {
window.location.href = "/indexing";
}
</script>
</html>
from django.shortcuts import render
from django.http import HttpResponse
from InvertedIndexSimulator.inverted import main
import pandas as pd
import xml.etree.ElementTree as et
def home(request):
    """Landing page: lets the user pick which song dataset to explore."""
    template_name = 'apps/home.html'
    return render(request, template_name)
def dataframe(request):
    """Render the dataset page.

    Parses the XML corpus into parallel DOCNO/SONG/ARTIST/LYRICS lists and
    passes them to the dataframe template, one list per table column.
    """
    parse_data = et.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
    data = parse_data.getroot()
    df_cols = ["DOCNO", "SONG", "ARTIST", "LYRICS"]

    def _text(node, tag):
        # Guard the result of find(), not `node`: find() returns None when the
        # child tag is missing, and None.text would raise AttributeError.
        child = node.find(tag)
        return child.text if child is not None else None

    rows = []
    for node in data:
        rows.append({
            "DOCNO": _text(node, "DOCNO"),
            "SONG": _text(node, "SONG"),
            "ARTIST": _text(node, "ARTIST"),
            "LYRICS": _text(node, "LYRICS"),
        })

    # Renamed from `DataFrame` so the local does not shadow the pandas class.
    df = pd.DataFrame(rows, columns=df_cols)
    # Keyed by DOCNO so the per-column lists stay aligned per document.
    dictionary = df.set_index('DOCNO').T.to_dict('list')
    nilai = list(dictionary.values())
    nomornya = list(dictionary.keys())
    lagunya = [sublist[0] for sublist in nilai]
    artisnya = [sublist[1] for sublist in nilai]
    liriknya = [sublist[2] for sublist in nilai]
    context = {"DOCNO": nomornya, "SONG": lagunya, "ARTIST": artisnya, "LYRICS": liriknya}
    return render(request, 'apps/dataframe.html', context)
def _load_corpus():
    """Parse the song XML dataset into (doc numbers, song titles, lyrics) lists."""
    tree = et.ElementTree()
    tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
    all_doc_no = [node.text for node in tree.iter("DOCNO")]
    all_song = [node.text for node in tree.iter("SONG")]
    all_text = [node.text for node in tree.iter("LYRICS")]
    return all_doc_no, all_song, all_text


def _tokens_after_stage(stage):
    """Run the preprocessing pipeline up to `stage`; return per-document tokens.

    stage 1: punctuation removal + tokenization
    stage 2: + case folding
    stage 3: + stop-word removal and digit-bearing-token filtering
    stage 4: + stemming

    Replaces four copy-pasted pipelines (one per view) with a single
    parameterized implementation.
    """
    _, all_song, all_text = _load_corpus()
    # Each "document" is the song title concatenated with its lyrics.
    docs = [all_song[i] + all_text[i] for i in range(len(all_text))]
    tokens_doc = [main.remove_punc_tokenize(sentence) for sentence in docs]
    if stage >= 2:
        tokens_doc = [main.to_lower(tokens) for tokens in tokens_doc]
    if stage >= 3:
        tokens_doc = [main.stop_word_token(tokens) for tokens in tokens_doc]
        # Drop any token containing a digit (chart positions, years, ...).
        tokens_doc = [[w for w in tokens if not any(ch.isdigit() for ch in w)]
                      for tokens in tokens_doc]
    if stage >= 4:
        tokens_doc = [main.stemming(tokens) for tokens in tokens_doc]
    return tokens_doc


def preprocessing(request):
    """Stage 1: tokens after punctuation removal and tokenization."""
    context = {"tokens_doc": _tokens_after_stage(1)}
    return render(request, 'apps/preprocessing.html', context)


def preprocessing2(request):
    """Stage 2: tokens after case folding."""
    context = {"tokens_doc": _tokens_after_stage(2)}
    return render(request, 'apps/preprocessing2.html', context)


def preprocessing3(request):
    """Stage 3: tokens after stop-word removal and digit filtering."""
    context = {"tokens_doc": _tokens_after_stage(3)}
    return render(request, 'apps/preprocessing3.html', context)


def preprocessing4(request):
    """Stage 4: tokens after stemming (fully normalized)."""
    context = {"tokens_doc": _tokens_after_stage(4)}
    return render(request, 'apps/preprocessing4.html', context)
def indexing(request):
    """Build the proximity (positional inverted) index and render it.

    Fixes two defects in the original:
    - `all_doc_no` holds plain strings (ElementTree `node.text`), so
      `all_doc_no[n].firstChild.data` raised AttributeError; use the string.
    - the final loop rebound `indexnya` on every iteration, so only the LAST
      (term, postings) pair reached the template; collect every entry instead.
    """
    tree = et.ElementTree()
    tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
    all_doc_no = [node.text for node in tree.iter("DOCNO")]
    all_song = [node.text for node in tree.iter("SONG")]
    all_text = [node.text for node in tree.iter("LYRICS")]
    n_doc = len(all_text)

    # Full preprocessing pipeline: tokenize, case-fold, stop words, drop
    # digit-bearing tokens, stem.
    tokens_doc = []
    for i in range(n_doc):
        tokens = main.remove_punc_tokenize(all_song[i] + all_text[i])
        tokens = main.to_lower(tokens)
        tokens = main.stop_word_token(tokens)
        tokens = [w for w in tokens if not any(ch.isdigit() for ch in w)]
        tokens_doc.append(main.stemming(tokens))

    # Vocabulary over all documents. (The original also re-tokenized the
    # joined vocabulary with CountVectorizer and unioned it back in, which is
    # a no-op on an already-tokenized set.)
    all_tokens = set()
    for tokens in tokens_doc:
        all_tokens.update(tokens)

    # term -> {doc_no: [1-based positions of the term in that document]}
    proximity_index = {}
    for token in all_tokens:
        dict_doc_position = {}
        for n in range(n_doc):
            if token in tokens_doc[n]:
                dict_doc_position[all_doc_no[n]] = [
                    i + 1 for i, w in enumerate(tokens_doc[n]) if w == token]
        proximity_index[token] = dict_doc_position

    # Sorted by term; keep every (term, postings) entry for the template.
    indexnya = sorted(proximity_index.items())
    context = {"indexnya": indexnya}
    return render(request, 'apps/indexing.html', context)
def index(request):
    """Search page: shows the query-input form."""
    template_name = 'apps/index.html'
    return render(request, template_name)
def lyric(request, id):
    """Detail page for one song: resolve `id` to its title and lyrics."""
    text, judul = main.detail(id)
    content = {
        'no': id,
        'judul': judul,
        'text': text,
    }
    return render(request, 'apps/lyric.html', content)
def result(request):
    """Handle a search query POSTed from the index page; show ranked results.

    The original only assigned `content` inside the POST branch, so any GET
    (e.g. a page reload or direct navigation to /result/) crashed with a
    NameError; non-POST requests now render an empty result set instead.
    """
    if request.method == 'POST':
        query = request.POST['querysearch']
        hasil = main.main(query)
        content = {
            'hasil': hasil,
            'query': query,
        }
    else:
        content = {'hasil': [], 'query': ''}
    return render(request, 'apps/result.html', content)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment