Rosa Delima Mendrofa / SearchEngine / Commits / 2d25e3c9

Commit 2d25e3c9 authored May 08, 2020 by Yolanda Nainggolan
add indexing
parent e0d68fdf
Showing 5 changed files with 111 additions and 67 deletions
views.cpython-37.pyc    ...e/InvertedIndexSimulator/__pycache__/views.cpython-37.pyc    +0 -0
main.cpython-37.pyc     ...edIndexSimulator/inverted/__pycache__/main.cpython-37.pyc    +0 -0
main.py                 SearchEngine/InvertedIndexSimulator/inverted/main.py            +2 -2
indexing.html           ...ngine/InvertedIndexSimulator/templates/apps/indexing.html    +62 -20
views.py                SearchEngine/InvertedIndexSimulator/views.py                     +47 -45
SearchEngine/InvertedIndexSimulator/__pycache__/views.cpython-37.pyc
No preview for this file type
SearchEngine/InvertedIndexSimulator/inverted/__pycache__/main.cpython-37.pyc
No preview for this file type
SearchEngine/InvertedIndexSimulator/inverted/main.py
...
...
@@ -249,4 +249,4 @@ def detail(nomor):
     if check == id:
         text = all_text[i]
         judul = all_song[i]
-    return text, judul
\ No newline at end of file
+    return text, judul
\ No newline at end of file
SearchEngine/InvertedIndexSimulator/templates/apps/indexing.html
...
...
@@ -5,39 +5,81 @@
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>Song Lyric Search Engine</title>
  <link href="../../static/assets/css/dataframe.min.css" rel="stylesheet">
  <style>
    #leftbox {
      text-align: center;
      float: left;
      white-space: nowrap;
    }
    #middlebox {
      float: left;
      text-align: center;
      white-space: nowrap;
    }
    #middleboxb {
      float: left;
      text-align: left;
      white-space: nowrap;
    }
  </style>
</head>
<body>
  <main>
    <div id="content">
      <article class="card">
        <div>
          <div>
            <button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
          </div>
          <div align="right">
            <button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
          </div>
        </div>
        <center><h1>Indexing</h1><br></center>
        <p><strong>Dengan Proximity Index</strong></p><br></center>
        <table style="width:100%">
          <tr>
            <th>Apa judulnya ya?</th>
          </tr>
        <div>
          <div>
            <button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
          </div>
          <div align="right">
            <button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
          </div>
        </div>
        <center><h1>Proximity Index</h1><br></center>
        <article class="carda" style="overflow-x:scroll; overflow-y:scroll;">
          {% for i in indexnya %}
          <tr>
            <td>{{ i }}</td>
          </tr>
          {% endfor %}
          <div id="leftbox">
            <table>
              <tr>
                <th>Token</th>
              </tr>
              {% for i in words %}
              <tr>
                <td>{{ i }}</td>
              </tr>
              {% endfor %}
            </table>
          </div>
          <div id="middleboxb">
            <table align="left">
              <tr>
                <th>Index</th>
              </tr>
              {% for i in freq %}
              <tr>
                <td>{{ i }}</td>
              </tr>
              {% endfor %}
            </table>
          </div>
          </table>
        </article>
      </article>
    </div>
  </main>
  <!-- <footer>
    <p>© STBI-2020-03</p>
  </footer> -->
</body>
...
...
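For context, a minimal sketch of the data shape the new template loops assume (the sample index below is illustrative, not from the repository): words iterates over the tokens and freq over the per-token posting dictionaries, which the #leftbox and #middleboxb tables render side by side as the Token and Index columns.

import json

# Hypothetical toy proximity index: token -> {doc_no: [positions]}
proximity_index = {
    "love": {"D1": [3, 17], "D2": [5]},
    "night": {"D2": [1]},
}

# Same JSON round-trip the view performs before rendering apps/indexing.html
indexnya = json.loads(json.dumps(proximity_index))
words = indexnya.keys()    # -> Token column
freq = indexnya.values()   # -> Index column

for token, postings in zip(words, freq):
    print(f"{token:<10} {postings}")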
SearchEngine/InvertedIndexSimulator/views.py
...
...
@@ -3,6 +3,19 @@ from django.http import HttpResponse
from InvertedIndexSimulator.inverted import main
import pandas as pd
import xml.etree.ElementTree as et
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
import xml.dom.minidom as minidom
import collections
from itertools import count

try:
    from future_builtins import zip
except ImportError:  # not 2.6+ or is 3.x
    try:
        from itertools import izip as zip  # < 2.5 or 3.x
    except ImportError:
        pass

def home(request):
    return render(request, 'apps/home.html')
...
...
@@ -201,68 +214,53 @@ def preprocessing4(request):
def indexing(request):
    import string
    import re
    from sklearn.feature_extraction.text import CountVectorizer
    from xml.etree.ElementTree import ElementTree
    tree = ElementTree()
    tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
    all_doc_no = []
    all_song = []
    all_text = []
    for node in tree.iter("DOCNO"):
        all_doc_no.append(node.text)
    for node in tree.iter("SONG"):
        all_song.append(node.text)
    for node in tree.iter("LYRICS"):
        all_text.append(node.text)
    import xml.dom.minidom as minidom
    dcmnt_xml = minidom.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
    all_doc_no = dcmnt_xml.getElementsByTagName('DOCNO')
    all_profile = dcmnt_xml.getElementsByTagName('SONG')
    all_date = dcmnt_xml.getElementsByTagName('ARTIST')
    all_text = dcmnt_xml.getElementsByTagName('LYRICS')
    all_pub = dcmnt_xml.getElementsByTagName('PUB')
    all_page = dcmnt_xml.getElementsByTagName('PAGE')
    N_DOC = len(all_text)
    N_DOC = len(all_doc_no)
    all_sentence_doc = []
    all_sentence_doc_sample = []
    for i in range(N_DOC):
        all_sentence_doc.append(all_song[i] + all_text[i])
        tokens_doc = []
        sentence_doc_sample = ' ' + all_text[i].firstChild.data
        all_sentence_doc_sample.append(sentence_doc_sample)
    for i in range(N_DOC):
        tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc[i]))
    tokens_doc = []
    for i in range(N_DOC):
        tokens_doc[i] = main.to_lower(tokens_doc[i])
        tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc_sample[i]))
    for i in range(N_DOC):
        tokens_doc[i] = main.stop_word_token(tokens_doc[i])
    for i in range(N_DOC):
        tokens_doc[i] = ([w for w in tokens_doc[i] if not any(j.isdigit() for j in w)])
    for i in range(N_DOC):
        tokens_doc[i] = main.stemming(tokens_doc[i])
    all_tokens = []
    all_tokens = []
    for i in range(N_DOC):
        for j in tokens_doc[i]:
            all_tokens.append(j)
        new_sentences = ' '.join([w for w in all_tokens])
        for w in tokens_doc[i]:
            all_tokens.append(w)
    for j in CountVectorizer().build_tokenizer()(new_sentences):
        all_tokens.append(j)
    new_sentence = ' '.join([w for w in all_tokens])
    for w in CountVectorizer().build_tokenizer()(new_sentence):
        all_tokens.append(w)
    all_tokens = set(all_tokens)
    from itertools import count
    try:
        from future_builtins import zip
    except ImportError:  # not 2.6+ or is 3.x
        try:
            from itertools import izip as zip  # < 2.5 or 3.x
        except ImportError:
            pass
    proximity_index = {}
    for token in all_tokens:
        dict_doc_position = {}
...
...
@@ -271,12 +269,16 @@ def indexing(request):
            dict_doc_position[all_doc_no[n].firstChild.data] = [i + 1 for i, j in zip(count(), tokens_doc[n]) if j == token]
        proximity_index[token] = dict_doc_position
    import collections
    proximity_index = collections.OrderedDict(sorted(proximity_index.items()))
    for key, value in proximity_index.items():
        indexnya = (key, value)
    context = {"indexnya": indexnya}
    import json
    indexnya = json.loads(json.dumps(proximity_index))
    words = indexnya.keys()
    freq = indexnya.values()
    context = {"words": words, "freq": freq}
    return render(request, 'apps/indexing.html', context)
...
...
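For reference, a minimal standalone sketch of the positional (proximity) index that indexing() builds: each token maps to a dict of document number to 1-based positions. The sample token lists and doc numbers below are hypothetical; in the view the tokens come from main.remove_punc_tokenize, main.to_lower, main.stop_word_token and main.stemming, and positions are produced with the equivalent zip(count(), tokens_doc[n]) idiom.

import collections

# Hypothetical token lists; the real view derives these from the lyrics XML.
tokens_doc = [
    ["love", "me", "tender", "love", "me", "true"],  # doc D1
    ["tender", "is", "the", "night"],                # doc D2
]
doc_nos = ["D1", "D2"]

proximity_index = {}
for token in set(t for doc in tokens_doc for t in doc):
    dict_doc_position = {}
    for n, doc in enumerate(tokens_doc):
        # enumerate(doc, start=1) yields the same 1-based positions as
        # [i + 1 for i, j in zip(count(), doc) if j == token]
        dict_doc_position[doc_nos[n]] = [pos for pos, w in enumerate(doc, start=1) if w == token]
    proximity_index[token] = dict_doc_position

# Sort tokens alphabetically, as the view does with OrderedDict(sorted(...))
proximity_index = collections.OrderedDict(sorted(proximity_index.items()))
print(proximity_index["love"])    # {'D1': [1, 4], 'D2': []}
print(proximity_index["tender"])  # {'D1': [3], 'D2': [1]}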