Commit 2d25e3c9 by Yolanda Nainggolan

add indexing

parent e0d68fdf
...@@ -249,4 +249,4 @@ def detail(nomor): ...@@ -249,4 +249,4 @@ def detail(nomor):
if check == id: if check == id:
text = all_text[i] text = all_text[i]
judul = all_song[i] judul = all_song[i]
return text,judul return text,judul
\ No newline at end of file \ No newline at end of file
...@@ -5,39 +5,81 @@ ...@@ -5,39 +5,81 @@
<meta name="viewport" content="width=device-width, initial-scale=1"> <meta name="viewport" content="width=device-width, initial-scale=1">
<title>Song Lyric Search Engine</title> <title>Song Lyric Search Engine</title>
<link href="../../static/assets/css/dataframe.min.css" rel="stylesheet"> <link href="../../static/assets/css/dataframe.min.css" rel="stylesheet">
<style>
#leftbox {
text-align: center;
float:left;
white-space: nowrap;
}
#middlebox{
float:left;
text-align: center;
white-space: nowrap;
}
#middleboxb{
float:left;
text-align: left;
white-space: nowrap;
}
</style>
</head> </head>
<body> <body>
<main> <main>
<div id="content"> <div id="content">
<article class="card"> <article class="card">
<div> <div>
<div> <div>
<button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button> <button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
</div> </div>
<div align="right"> <div align="right">
<button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button> <button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
</div> </div>
</div> </div>
<center><h1>Indexing</h1><br></center>
<p><strong>Dengan Proximity Index</strong></p><br></center> <center><h1>Proximity Index</h1><br></center>
<table style="width:100%"> <article class="carda" style="overflow-x:scroll; overflow-y:scroll;">
<tr>
<th>Apa judulnya ya?</th>
</tr>
{% for i in indexnya %} <div id = "leftbox">
<tr> <table>
<td>{{ i }}</td> <tr>
</tr> <th>Token</th>
{% endfor %} </tr>
{% for i in words %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
<div id = "middleboxb">
<table align="left">
<tr>
<th>Index</th>
</tr>
{% for i in freq %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
</table> </article>
</article> </article>
</div> </div>
</main> </main>
<!-- <footer>
<p>&copy; STBI-2020-03</p>
</footer> -->
</body> </body>
......
...@@ -3,6 +3,19 @@ from django.http import HttpResponse ...@@ -3,6 +3,19 @@ from django.http import HttpResponse
from InvertedIndexSimulator.inverted import main from InvertedIndexSimulator.inverted import main
import pandas as pd import pandas as pd
import xml.etree.ElementTree as et import xml.etree.ElementTree as et
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
import xml.dom.minidom as minidom
import collections
from itertools import count
try:
from future_builtins import zip
except ImportError: # not 2.6+ or is 3.x
try:
from itertools import izip as zip # < 2.5 or 3.x
except ImportError:
pass
def home(request): def home(request):
return render(request, 'apps/home.html') return render(request, 'apps/home.html')
...@@ -201,68 +214,53 @@ def preprocessing4(request): ...@@ -201,68 +214,53 @@ def preprocessing4(request):
def indexing(request): def indexing(request):
import string
import re
from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import CountVectorizer
from xml.etree.ElementTree import ElementTree import xml.dom.minidom as minidom
tree = ElementTree() dcmnt_xml = minidom.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
all_doc_no = dcmnt_xml.getElementsByTagName('DOCNO')
all_doc_no = [] all_profile = dcmnt_xml.getElementsByTagName('SONG')
all_song = [] all_date = dcmnt_xml.getElementsByTagName('ARTIST')
all_text = [] all_text = dcmnt_xml.getElementsByTagName('LYRICS')
all_pub = dcmnt_xml.getElementsByTagName('PUB')
for node in tree.iter("DOCNO"): all_page = dcmnt_xml.getElementsByTagName('PAGE')
all_doc_no.append(node.text)
for node in tree.iter("SONG"):
all_song.append(node.text)
for node in tree.iter("LYRICS"):
all_text.append(node.text)
N_DOC = len(all_text) N_DOC = len(all_doc_no)
all_sentence_doc = [] all_sentence_doc_sample = []
for i in range(N_DOC): for i in range(N_DOC):
all_sentence_doc.append(all_song[i] + all_text[i]) sentence_doc_sample = ' '+ all_text[i].firstChild.data
all_sentence_doc_sample.append(sentence_doc_sample)
tokens_doc = []
for i in range(N_DOC): tokens_doc = []
tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc[i]))
for i in range(N_DOC): for i in range(N_DOC):
tokens_doc[i] = main.to_lower(tokens_doc[i]) tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc_sample[i]))
for i in range(N_DOC): for i in range(N_DOC):
tokens_doc[i] = main.stop_word_token(tokens_doc[i]) tokens_doc[i] = main.stop_word_token(tokens_doc[i])
for i in range(N_DOC): for i in range(N_DOC):
tokens_doc[i] = ([w for w in tokens_doc[i] if not any(j.isdigit() for j in w)]) tokens_doc[i] = ([w for w in tokens_doc[i] if not any(j.isdigit() for j in w)])
for i in range(N_DOC): for i in range(N_DOC):
tokens_doc[i] = main.stemming(tokens_doc[i]) tokens_doc[i] = main.stemming(tokens_doc[i])
all_tokens =[] all_tokens = []
for i in range(N_DOC): for i in range(N_DOC):
for j in tokens_doc[i]: for w in tokens_doc[i]:
all_tokens.append(j) all_tokens.append(w)
new_sentences = ' '.join([w for w in all_tokens])
for j in CountVectorizer().build_tokenizer()(new_sentences): new_sentence = ' '.join([w for w in all_tokens])
all_tokens.append(j)
for w in CountVectorizer().build_tokenizer()(new_sentence):
all_tokens.append(w)
all_tokens = set(all_tokens) all_tokens = set(all_tokens)
from itertools import count
try:
from future_builtins import zip
except ImportError: # not 2.6+ or is 3.x
try:
from itertools import izip as zip # < 2.5 or 3.x
except ImportError:
pass
proximity_index = {} proximity_index = {}
for token in all_tokens: for token in all_tokens:
dict_doc_position = {} dict_doc_position = {}
...@@ -271,12 +269,16 @@ def indexing(request): ...@@ -271,12 +269,16 @@ def indexing(request):
dict_doc_position[all_doc_no[n].firstChild.data] = [i+1 for i, j in zip(count(), tokens_doc[n]) if j == token] dict_doc_position[all_doc_no[n].firstChild.data] = [i+1 for i, j in zip(count(), tokens_doc[n]) if j == token]
proximity_index[token] = dict_doc_position proximity_index[token] = dict_doc_position
import collections
proximity_index = collections.OrderedDict(sorted(proximity_index.items())) proximity_index = collections.OrderedDict(sorted(proximity_index.items()))
for key, value in proximity_index.items():
indexnya = (key, value)
context = {"indexnya": indexnya} import json
indexnya = json.loads(json.dumps(proximity_index))
words = indexnya.keys()
freq = indexnya.values()
context = {"words": words, "freq": freq}
return render(request, 'apps/indexing.html', context) return render(request, 'apps/indexing.html', context)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment