#!/usr/bin/env python
# coding: utf-8
# In[1]:
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
import xml.dom.minidom as minidom
# Parse the article collection; Dataset_Article.xml is expected in the working directory.
dcmnt_xml = minidom.parse("Dataset_Article.xml")
# In[2]:
# Collect the elements of each article record by tag name.
all_doc_no = dcmnt_xml.getElementsByTagName('Id')
all_profile = dcmnt_xml.getElementsByTagName('title')
all_date = dcmnt_xml.getElementsByTagName('year')
all_text = dcmnt_xml.getElementsByTagName('content')
all_pub = dcmnt_xml.getElementsByTagName('author')
N_DOC_sample = len(all_doc_no)
# In[3]:
print(N_DOC_sample)
# In[4]:
all_sentence_doc_sample = []
for i in range(N_DOC_sample):
    sentence_doc_sample = ' ' + all_text[i].firstChild.data
    all_sentence_doc_sample.append(sentence_doc_sample)
# In[5]:
all_sentence_doc_sample
# ## Preprocessing
# In[6]:
tokens_doc = []
# In[7]:
def remove_punc_tokenize(sentence):
    tokens = []
    # Strip URLs first, before punctuation removal breaks the "://" pattern.
    sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, " ")
    for w in CountVectorizer().build_tokenizer()(sentence):
        tokens.append(w)
    return tokens
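
# A quick, hedged illustration (the sentence below is a made-up sample, not
# part of the dataset): punctuation is replaced by spaces, and single-character
# leftovers such as "s" and "a" are dropped by CountVectorizer's default
# token pattern.
print(remove_punc_tokenize("Hello, world! It's a small test."))
# -> ['Hello', 'world', 'It', 'small', 'test']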
# In[8]:
for i in range(N_DOC_sample):
    tokens_doc.append(remove_punc_tokenize(all_sentence_doc_sample[i]))
# In[9]:
tokens_doc
# In[10]:
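# The stop-word step below relies on NLTK's 'stopwords' corpus being present
# locally; this guard (an environment assumption, not part of the original
# notebook) downloads it once if it is missing.
import nltk
from nltk.corpus import stopwords
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')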
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def stop_word_token(tokens):
    # Drop English stop words from a token list.
    tokens = [w for w in tokens if w not in stop_words]
    return tokens

for i in range(N_DOC_sample):
    tokens_doc[i] = stop_word_token(tokens_doc[i])
# In[11]:
# Drop any token that contains a digit (e.g. years or other numbers).
for i in range(N_DOC_sample):
    tokens_doc[i] = [w for w in tokens_doc[i] if not any(j.isdigit() for j in w)]
# In[12]:
tokens_doc
# In[13]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def stemming(tokens):
    # Reduce each token to its Porter stem.
    for i in range(len(tokens)):
        tokens[i] = stemmer.stem(tokens[i])
    return tokens

for i in range(N_DOC_sample):
    tokens_doc[i] = stemming(tokens_doc[i])
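
# A small, hedged illustration of Porter stemming on sample words (the words
# are illustrative and not taken from the dataset).
print(stemmer.stem('running'), stemmer.stem('studies'))  # typically: run studi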
# In[14]:
all_tokens = []
for i in range(N_DOC_sample):
    for w in tokens_doc[i]:
        all_tokens.append(w)

new_sentence = ' '.join(all_tokens)

# Rebuild all_tokens from the joined text (the original appended these tokens
# on top of the existing list, duplicating every term) and keep each term
# once, so the index loop below visits each vocabulary entry a single time.
all_tokens = list(dict.fromkeys(CountVectorizer().build_tokenizer()(new_sentence)))
# In[15]:
all_tokens
# In[16]:
from itertools import count
try:
    # Python 2 compatibility; on Python 3 the built-in zip is already lazy.
    from itertools import izip as zip
except ImportError:
    pass

# Build a positional (proximity) index: for every term, map each document Id
# to the list of 1-based positions where the term occurs.
proximity_index = {}
for token in all_tokens:
    dict_doc_position = {}
    for n in range(N_DOC_sample):
        if token in tokens_doc[n]:
            dict_doc_position[all_doc_no[n].firstChild.data] = [
                i + 1 for i, j in zip(count(), tokens_doc[n]) if j == token
            ]
    proximity_index[token] = dict_doc_position
# In[17]:
import collections

# Sort the index alphabetically by term and print the postings.
proximity_index = collections.OrderedDict(sorted(proximity_index.items()))
for key, value in proximity_index.items():
    print(key, value)
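
# A minimal sketch (not part of the original notebook) of how the positional
# index built above could be used for proximity queries: find documents in
# which two stemmed terms occur within k token positions of each other. The
# helper name and the example terms are placeholders.
def proximity_search(index, term_a, term_b, k=3):
    matches = {}
    postings_a = index.get(term_a, {})
    postings_b = index.get(term_b, {})
    for doc_id in set(postings_a) & set(postings_b):
        pairs = [(pa, pb)
                 for pa in postings_a[doc_id]
                 for pb in postings_b[doc_id]
                 if abs(pa - pb) <= k]
        if pairs:
            matches[doc_id] = pairs
    return matches

# Example call with placeholder terms; query terms should be stemmed with the
# same PorterStemmer used above so they match the index keys.
# print(proximity_search(proximity_index, stemmer.stem('information'),
#                        stemmer.stem('retrieval'), k=5))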