Commit 2d25e3c9 by Yolanda Nainggolan

add indexing

parent e0d68fdf
......@@ -249,4 +249,4 @@ def detail(nomor):
if check == id:
text = all_text[i]
judul = all_song[i]
return text,judul
\ No newline at end of file
return text,judul
\ No newline at end of file
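The hunk above only shows the tail of detail(nomor). A minimal sketch of how such a lookup plausibly fits together, assuming all_doc_no, all_song and all_text are parallel lists parsed from the same XML dataset (the surrounding code is not visible in this diff, so names and control flow here are illustrative only):

# Hypothetical reconstruction, not the repository's actual detail() function.
def detail(nomor):
    text, judul = None, None
    for i, doc_id in enumerate(all_doc_no):
        # stop at the document whose id matches the requested number
        if doc_id == nomor:
            text = all_text[i]    # lyric body
            judul = all_song[i]   # song title
            break
    return text, judul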
......@@ -5,39 +5,81 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Song Lyric Search Engine</title>
<link href="../../static/assets/css/dataframe.min.css" rel="stylesheet">
<style>
#leftbox {
text-align: center;
float:left;
white-space: nowrap;
}
#middlebox{
float:left;
text-align: center;
white-space: nowrap;
}
#middleboxb{
float:left;
text-align: left;
white-space: nowrap;
}
</style>
</head>
<body>
<main>
<div id="content">
<article class="card">
<div>
<div>
<button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
</div>
<div align="right">
<button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
</div>
</div>
<center><h1>Indexing</h1><br></center>
<p><strong>With Proximity Index</strong></p><br>
<table style="width:100%">
<tr>
<th>What's the title?</th>
</tr>
<div>
<div>
<button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
</div>
<div align="right">
<button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
</div>
</div>
<center><h1>Proximity Index</h1><br></center>
<article class="carda" style="overflow-x:scroll; overflow-y:scroll;">
{% for i in indexnya %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
<div id = "leftbox">
<table>
<tr>
<th>Token</th>
</tr>
{% for i in words %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
<div id = "middleboxb">
<table align="left">
<tr>
<th>Index</th>
</tr>
{% for i in freq %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
</table>
</article>
</article>
</div>
</main>
<!-- <footer>
<p>&copy; STBI-2020-03</p>
</footer> -->
</body>
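In the new layout, the left table (Token) and the right table (Index) are rendered from two separate loops over words and freq, so the rows only line up if both come from the same dictionary in the same order. A minimal sketch of the kind of context this template expects, with a made-up index purely for illustration (the real values are produced by indexing() further down):

proximity_index = {
    "love": {"D1": [1, 3], "D2": [2]},
    "time": {"D2": [3]},
}
context = {
    "words": proximity_index.keys(),   # left column: one row per token
    "freq": proximity_index.values(),  # right column: postings for that token
}
# return render(request, 'apps/indexing.html', context)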
......@@ -3,6 +3,19 @@ from django.http import HttpResponse
from InvertedIndexSimulator.inverted import main
import pandas as pd
import xml.etree.ElementTree as et
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
import xml.dom.minidom as minidom
import collections
from itertools import count
try:
from future_builtins import zip
except ImportError: # not 2.6+ or is 3.x
try:
from itertools import izip as zip # < 2.5 or 3.x
except ImportError:
pass
def home(request):
return render(request, 'apps/home.html')
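The try/except around zip is a Python 2/3 compatibility shim: on old Python 2 versions it swaps in itertools.izip so that pairing a position counter with a token stream stays lazy, while on Python 3 the built-in zip already behaves that way. A small sketch of the pattern it enables later in indexing(), with an illustrative token list:

from itertools import count

tokens = ["love", "me", "love", "you"]
# pair each token with its 1-based position, then keep the positions of one term
positions = [i + 1 for i, tok in zip(count(), tokens) if tok == "love"]
print(positions)  # [1, 3]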
......@@ -201,68 +214,53 @@ def preprocessing4(request):
def indexing(request):
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
from xml.etree.ElementTree import ElementTree
tree = ElementTree()
tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
all_doc_no = []
all_song = []
all_text = []
for node in tree.iter("DOCNO"):
all_doc_no.append(node.text)
for node in tree.iter("SONG"):
all_song.append(node.text)
for node in tree.iter("LYRICS"):
all_text.append(node.text)
import xml.dom.minidom as minidom
dcmnt_xml = minidom.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
all_doc_no = dcmnt_xml.getElementsByTagName('DOCNO')
all_profile = dcmnt_xml.getElementsByTagName('SONG')
all_date = dcmnt_xml.getElementsByTagName('ARTIST')
all_text = dcmnt_xml.getElementsByTagName('LYRICS')
all_pub = dcmnt_xml.getElementsByTagName('PUB')
all_page = dcmnt_xml.getElementsByTagName('PAGE')
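# The dataset is assumed to look roughly like this (illustrative, not the real file):
#   <DOC><DOCNO>D1</DOCNO><SONG>...</SONG><ARTIST>...</ARTIST>
#        <LYRICS>...</LYRICS><PUB>...</PUB><PAGE>...</PAGE></DOC>
# getElementsByTagName returns DOM element nodes; the text content is read later
# via node.firstChild.data.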
N_DOC = len(all_text)
N_DOC = len(all_doc_no)
all_sentence_doc = []
all_sentence_doc_sample = []
for i in range(N_DOC):
all_sentence_doc.append(all_song[i] + all_text[i])
tokens_doc = []
sentence_doc_sample = ' '+ all_text[i].firstChild.data
all_sentence_doc_sample.append(sentence_doc_sample)
for i in range(N_DOC):
tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc[i]))
tokens_doc = []
for i in range(N_DOC):
tokens_doc[i] = main.to_lower(tokens_doc[i])
tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc_sample[i]))
for i in range(N_DOC):
tokens_doc[i] = main.stop_word_token(tokens_doc[i])
for i in range(N_DOC):
tokens_doc[i] = ([w for w in tokens_doc[i] if not any(j.isdigit() for j in w)])
for i in range(N_DOC):
tokens_doc[i] = main.stemming(tokens_doc[i])
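# Rough illustration of the preprocessing pipeline above (assumed behaviour of the
# helpers in main; not part of this commit). For one already-lowercased lyric line:
#   main.remove_punc_tokenize("i love you 2 times!") -> ['i', 'love', 'you', '2', 'times']
#   main.stop_word_token(...)                        -> ['love', '2', 'times']   (drops stopwords)
#   digit filter above                               -> ['love', 'times']        (drops tokens containing digits)
#   main.stemming(...)                               -> ['love', 'time']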
all_tokens =[]
all_tokens = []
for i in range(N_DOC):
for j in tokens_doc[i]:
all_tokens.append(j)
new_sentences = ' '.join([w for w in all_tokens])
for w in tokens_doc[i]:
all_tokens.append(w)
for j in CountVectorizer().build_tokenizer()(new_sentences):
all_tokens.append(j)
new_sentence = ' '.join([w for w in all_tokens])
for w in CountVectorizer().build_tokenizer()(new_sentence):
all_tokens.append(w)
all_tokens = set(all_tokens)
from itertools import count
try:
from future_builtins import zip
except ImportError: # not 2.6+ or is 3.x
try:
from itertools import izip as zip # < 2.5 or 3.x
except ImportError:
pass
proximity_index = {}
for token in all_tokens:
dict_doc_position = {}
......@@ -271,12 +269,16 @@ def indexing(request):
dict_doc_position[all_doc_no[n].firstChild.data] = [i+1 for i, j in zip(count(), tokens_doc[n]) if j == token]
proximity_index[token] = dict_doc_position
import collections
proximity_index = collections.OrderedDict(sorted(proximity_index.items()))
for key, value in proximity_index.items():
indexnya = (key, value)
context = {"indexnya": indexnya}
import json
indexnya = json.loads(json.dumps(proximity_index))
words = indexnya.keys()
freq = indexnya.values()
context = {"words": words, "freq": freq}
return render(request, 'apps/indexing.html', context)
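Taken together, indexing() builds a positional ("proximity") index: for every token, a mapping from document number to the 1-based positions where that token occurs, sorted by token before being handed to the template. A self-contained sketch of that core step, with a toy corpus standing in for the parsed XML (document ids and tokens are made up for illustration; unlike the view, it skips documents where a token does not occur, to keep the output small):

import collections
from itertools import count

docs = {
    "D1": ["love", "me", "love", "you"],
    "D2": ["you", "love", "time"],
}

all_tokens = set(tok for tokens in docs.values() for tok in tokens)

proximity_index = {}
for token in all_tokens:
    dict_doc_position = {}
    for doc_no, tokens in docs.items():
        positions = [i + 1 for i, tok in zip(count(), tokens) if tok == token]
        if positions:  # keep only documents that actually contain the token
            dict_doc_position[doc_no] = positions
    proximity_index[token] = dict_doc_position

proximity_index = collections.OrderedDict(sorted(proximity_index.items()))
# OrderedDict([('love', {'D1': [1, 3], 'D2': [2]}), ('me', {'D1': [2]}),
#              ('time', {'D2': [3]}), ('you', {'D1': [4], 'D2': [1]})])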