Commit 20fe104a authored by Julien David
Browse files

Merge branch 'Mehdi_Lili_Index_inverse' into 'master'

Mehdi lili index inverse

See merge request !19
parents ae33d959 367d197c
Pipeline #3162 passed with stage
in 36 seconds
......@@ -15,12 +15,16 @@ lipn-search:
- echo "Début de la phase de test"
- echo "Début des tests sur l'index inversé"
- cd index_inverse/test/
- pytest test_hamming.py
- pytest test_tf_idf.py
- pytest test_bm25.py
- pytest test_IndexInverse.py
- cd ../../
- echo "Fin des tests sur l'index inversé"
- echo "Début des tests sur le pagerank"
- cd pagerank/test/
- echo "Faire des trucs"
- cd ../../
- echo "Début des tests sur le pagerank"
- echo "Fin de la phase de test"
\ No newline at end of file
- echo "Fin des tests sur le pagerank"
- echo "Fin de la phase de test"
......@@ -17,22 +17,18 @@ class IndexInverse :
docuCount=0 #Le nombre de documents dans la classe
vocaCount=0 #Le nombre de mots dans le dictionnaire vocabulary
def __init__(self,list_urls, list_content):
'''Constructeur IndexInverse'''
if len(list_urls)==len(list_content): #données valides
self.document_id = dict(zip(range(len(list_urls)), list_urls))
doc= [c.lower().replace(",", "").replace("/n", "").split(" ") for c in list_content]
for d in doc:
while '' in d:
d.remove('')
self.docs_content=doc
self.invert_index=dict()
self.vocabulary=dict()
IndexInverse.docuCount= len(list_urls)
else:
#erreur: la taille de list_url faut egale à la taille de list_contenu
raise Exception("Invalid parameter!")
def __init__(self):
self.document_id = dict()
self.docs_content = dict()
self.invert_index = dict()
self.vocabulary = dict()
def ajout_url(self, url , contenu):
self.document_id.setdefault(IndexInverse.docuCount,url)
doc= contenu.lower().split()
self.docs_content.setdefault(IndexInverse.docuCount,doc)
IndexInverse.docuCount += 1
def create_index (self):
''' Créer le dictionnaire invert_index.
......@@ -40,8 +36,8 @@ class IndexInverse :
La valeur correspondant à la clé est une liste, qui enregistre les documents (document_id) dans lesquels le mot apparaît.
'''
a=0
for doc_id,article in zip(range(IndexInverse.docuCount),self.docs_content):
for word in article:
for doc_id in range(IndexInverse.docuCount):
for word in self.docs_content[doc_id]:
if word in self.invert_index:
if not doc_id in self.invert_index[word]:
self.invert_index.setdefault(word,[]).append(doc_id)
......@@ -50,10 +46,9 @@ class IndexInverse :
a=a+1
self.invert_index.setdefault(word,[]).append(doc_id)
IndexInverse.vocaCount=len(self.vocabulary)
print(IndexInverse.vocaCount)
def get_index(self):
'''Obtenir le dictionnaire invert_index.
'''
......@@ -106,5 +101,3 @@ class IndexInverse :
return
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 30 19:51:28 2021
@author: jinlili
BM-25
"""
import numpy as np
class bm25:
    """Okapi BM25 weighting helpers built on NumPy.

    Both functions are stateless and are meant to be called on the class
    itself, e.g. ``bm25.idf_2(...)``.
    """

    @staticmethod
    def idf_2(invert_index, vocabulary, vocaCount, docuCount):
        """Return the BM25 inverse document frequency of every term.

        IDF(t) = log((|D| - |Dt| + 0.5) / (|Dt| + 0.5))

        Args:
            invert_index: mapping word -> collection of the ids of the
                documents that contain the word.
            vocabulary: mapping word -> row index in the result.
            vocaCount: number of rows of the result.
            docuCount: number of documents |D|.

        Returns:
            A ``(vocaCount, 1)`` float array. Words absent from
            ``invert_index`` keep a document frequency of 0. The value is
            negative for a word present in more than half of the documents.
        """
        df = np.zeros((vocaCount, 1))
        for word, row in vocabulary.items():
            postings = invert_index.get(word)
            if postings is not None:
                df[row, 0] = len(postings)
        return np.log((docuCount - df + 0.5) / (df + 0.5))

    @staticmethod
    def bm_25(tf_matrix_classique, docs_content, docuCount, idf_2, k=1.2, b=0.75):
        """Return the BM25 weight of every (term, document) pair.

        BM25(t, d) = IDF(t) * f(t,d) * (k + 1)
                     / (f(t,d) + k * (1 - b + b * |d| / avgdl))

        where |d| is the length of document d and avgdl is the average
        document length of the collection.

        Args:
            tf_matrix_classique: array of term frequencies with one row per
                term and one column per document.
            docs_content: the tokenised documents, either a sequence of
                token lists or a dict ``doc_id -> token list`` (columns are
                then taken in increasing ``doc_id`` order).
            docuCount: unused; kept so existing callers keep working.
            idf_2: ``(n_terms, 1)`` array as returned by :meth:`idf_2`.
            k: term-frequency saturation, in practice in [1.2, 2.0].
            b: length-normalisation strength, in practice 0.75.

        Returns:
            An array with the same shape as ``tf_matrix_classique``.
        """
        # Iterating a dict yields its keys, not the documents, so the dict
        # form has to be unpacked explicitly.
        if isinstance(docs_content, dict):
            docs = [docs_content[doc_id] for doc_id in sorted(docs_content)]
        else:
            docs = list(docs_content)
        doc_lengths = np.array([len(doc) for doc in docs], dtype=np.float64)
        # Guard the average so an empty collection (or only empty documents)
        # does not produce a 0/0 division.
        avg_length = np.sum(doc_lengths) / doc_lengths.size if doc_lengths.size else 0.0
        if avg_length > 0:
            relative_length = doc_lengths / avg_length
        else:
            relative_length = np.zeros_like(doc_lengths)
        # Shape (n_docs,): broadcasts over the document columns.
        length_norm = k * (1 - b + b * relative_length)
        return idf_2 * (tf_matrix_classique * (k + 1) / (tf_matrix_classique + length_norm))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 30 02:16:23 2021
TF-IDF
"""
import numpy as np
from collections import Counter
class tf_idf:
    """TF-IDF weighting helpers built on NumPy.

    All functions are stateless and are meant to be called on the class
    itself, e.g. ``tf_idf.idf(...)``. Term-frequency matrices have one row
    per vocabulary word and one column per document.
    """

    @staticmethod
    def tf_classique(docs_content, vocabulary, docuCount):
        """Return the term-frequency matrix of the collection.

        Each entry is the number of occurrences of the word in the document
        divided by the count of the most frequent word of that document.

        Args:
            docs_content: the tokenised documents, either a sequence of
                token lists or a dict ``doc_id -> token list`` (columns are
                then taken in increasing ``doc_id`` order).
            vocabulary: mapping word -> row index.
            docuCount: number of documents (columns).

        Returns:
            A ``(len(vocabulary), docuCount)`` float64 array.

        Raises:
            KeyError: if a document contains a word missing from
                ``vocabulary``.
        """
        # Iterating a dict yields its keys, not the documents.
        if isinstance(docs_content, dict):
            docs = [docs_content[doc_id] for doc_id in sorted(docs_content)]
        else:
            docs = docs_content
        tf_matrix = np.zeros((len(vocabulary), docuCount), dtype=np.float64)
        for doc_idx, doc in zip(range(docuCount), docs):
            counts = Counter(doc)
            if not counts:
                continue  # empty document: its column stays at zero
            max_count = max(counts.values())
            for word, count in counts.items():
                tf_matrix[vocabulary[word], doc_idx] = count / max_count
        return tf_matrix

    @staticmethod
    def tf_normalisation_log(tf_matrix_classique):
        """Logarithmic normalisation: 1 + log f(t,d) if f(t,d) > 0, else 0.

        Absent terms get a weight of 0 rather than 1 + log(0) = -inf, so the
        result is always finite.
        """
        tf = np.asarray(tf_matrix_classique, dtype=np.float64)
        weights = np.zeros_like(tf)
        present = tf > 0
        weights[present] = 1.0 + np.log(tf[present])
        return weights

    @staticmethod
    def tf_normalisation_max(tf_matrix_classique):
        """Max normalisation: 0.5 + 0.5 * f(t,d) / max over t' in d of f(t',d).

        The maximum is taken over the terms of the same document, i.e. down
        each column of the matrix. A document without any term gets 0.5 for
        every word instead of a division by zero.
        """
        tf = np.asarray(tf_matrix_classique, dtype=np.float64)
        if tf.size == 0:
            return 0.5 + 0.5 * tf
        doc_max = np.max(tf, axis=0, keepdims=True)
        safe_max = np.where(doc_max > 0, doc_max, 1.0)
        return 0.5 + 0.5 * (tf / safe_max)

    @staticmethod
    def idf(invert_index, vocabulary, vocaCount, docuCount):
        """Return the smoothed inverse document frequency of every term.

        IDF(t) = log((|D| + 1) / (|Dt| + 1))

        where |Dt| is the number of documents containing t. The +1 terms
        keep the denominator non-zero for words found in no document.
        Without the parentheses the expression evaluates as
        log(|D| + 1/|Dt| + 1), which divides by zero when |Dt| = 0.

        Args:
            invert_index: mapping word -> collection of the ids of the
                documents that contain the word.
            vocabulary: mapping word -> row index in the result.
            vocaCount: number of rows of the result.
            docuCount: number of documents |D|.

        Returns:
            A ``(vocaCount, 1)`` float array.
        """
        df = np.zeros((vocaCount, 1))
        for word, row in vocabulary.items():
            postings = invert_index.get(word)
            if postings is not None:
                df[row, 0] = len(postings)
        return np.log((docuCount + 1) / (df + 1))
......@@ -8,83 +8,43 @@ Created on Sat Mar 13 18:41:56 2021
import sys
sys.path.append('../src')
from IndexInverse import IndexInverse
import unittest
from unittest import TestCase
class TestIndexInverse(TestCase):
    """Tests of IndexInverse on a three-document toy corpus.

    The index is built once for the whole class and only read by the tests.
    """

    @classmethod
    def setUpClass(cls):
        # ajout_url numbers documents with the class-level counter
        # IndexInverse.docuCount; reset it so the ids asserted below (0, 1, 2)
        # do not depend on indexes created earlier in the same process.
        IndexInverse.docuCount = 0
        cls.index = IndexInverse()
        cls.index.ajout_url('lipn.fr_1', 'I love shanghai')
        cls.index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university')
        cls.index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy')
        cls.index.create_index()

    def test_IndexInverse_1(self):
        # A word present in every document.
        result = self.index.get_docs_with_keyword('i')
        self.assertEqual(result, [0, 1, 2])

    def test_IndexInverse_2(self):
        # An unknown word is reported as -1.
        result = self.index.get_docs_with_keyword('like')
        self.assertEqual(result, -1)

    def test_IndexInverse_3(self):
        result = self.index.get_nb_documents()
        self.assertEqual(result, 3)

    def test_IndexInverse_4(self):
        result = self.index.get_nb_vocabularys()
        self.assertEqual(result, 15)

    def test_IndexInverse_5(self):
        # Words are numbered in order of first appearance.
        result = self.index.get_vocabulary()
        self.assertEqual(result, {
            'i': 0, 'love': 1, 'shanghai': 2, 'am': 3, 'from': 4, 'now': 5,
            'study': 6, 'in': 7, 'tongji': 8, 'university': 9, 'lanzhou': 10,
            'of': 11, 'science': 12, 'and': 13, 'technolgy': 14,
        })

    def test_IndexInverse_6(self):
        result = self.index.get_document_id()
        self.assertEqual(result, {0: 'lipn.fr_1', 1: 'lipn.fr_2', 2: 'lipn.fr_3'})

    def test_IndexInverse_7(self):
        # Contents are stored lower-cased and tokenised, keyed by document id.
        result = self.index.get_docs_content()
        self.assertEqual(result, {
            0: ['i', 'love', 'shanghai'],
            1: ['i', 'am', 'from', 'shanghai', 'now', 'i', 'study', 'in',
                'tongji', 'university'],
            2: ['i', 'am', 'from', 'lanzhou', 'now', 'i', 'study', 'in',
                'lanzhou', 'university', 'of', 'science', 'and', 'technolgy'],
        })
# Run the tests of this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 31 00:08:41 2021
@author: jinlili
"""
import numpy as np
from numpy import inf
import sys
sys.path.append('../src')
from bm25 import bm25
import unittest
class SaneEqualityArray(np.ndarray):
    """ndarray whose ``==`` yields one truth value instead of an array."""

    def __eq__(self, other):
        """Tell whether ``other`` is a SaneEqualityArray with the same shape and content."""
        if not isinstance(other, SaneEqualityArray):
            return False
        if self.shape != other.shape:
            return False
        # Element-wise comparison collapsed into a single boolean.
        return np.ndarray.__eq__(self, other).all()
class TestDM25(unittest.TestCase):
    """Regression tests of the bm25 helpers on a three-document toy corpus."""

    # Reference BM25 IDF column, one row per vocabulary word.
    EXPECTED_IDF = np.array([
        -1.94591015, 0.51082562, -0.51082562, -0.51082562, -0.51082562,
        -0.51082562, -0.51082562, -0.51082562, 0.51082562, -0.51082562,
        0.51082562, 0.51082562, 0.51082562, 0.51082562, 0.51082562,
    ]).reshape(-1, 1)

    # Term frequencies, one row per word and one column per document.
    TF_MATRIX = np.array([
        [1., 1., 1.], [1., 0., 0.], [1., 0.5, 0.],
        [0., 0.5, 0.5], [0., 0.5, 0.5], [0., 0.5, 0.5],
        [0., 0.5, 0.5], [0., 0.5, 0.5], [0., 0.5, 0.],
        [0., 0.5, 0.5], [0., 0., 1.], [0., 0., 0.5],
        [0., 0., 0.5], [0., 0., 0.5], [0., 0., 0.5],
    ])

    def test_BM25_1(self):
        """idf_2 reproduces the reference IDF column."""
        words = ['i', 'love', 'shanghai', 'am', 'from', 'now', 'study', 'in',
                 'tongji', 'university', 'lanzhou', 'of', 'science', 'and',
                 'technolgy']
        vocabulary = {word: row for row, word in enumerate(words)}
        invert_index = {
            'i': [0, 1, 2], 'love': [0], 'shanghai': [0, 1], 'am': [1, 2],
            'from': [1, 2], 'now': [1, 2], 'study': [1, 2], 'in': [1, 2],
            'tongji': [1], 'university': [1, 2], 'lanzhou': [2], 'of': [2],
            'science': [2], 'and': [2], 'technolgy': [2],
        }
        computed = bm25.idf_2(invert_index, vocabulary, 15, 3)
        np.testing.assert_allclose(computed, self.EXPECTED_IDF)

    def test_BM5_2(self):
        """bm_25 reproduces the reference weights with the default k and b."""
        docs_content = [
            ['i', 'love', 'shanghai'],
            ['i', 'am', 'from', 'shanghai', 'now', 'i', 'study', 'in',
             'tongji', 'university'],
            ['i', 'am', 'from', 'lanzhou', 'now', 'i', 'study', 'in',
             'lanzhou', 'university', 'of', 'science', 'and', 'technolgy'],
        ]
        expected = np.array([
            [-2.67562645, -1.86130536, -1.58555642],
            [0.70238523, 0., 0.],
            [-0.70238523, -0.31217121, -0.],
            [-0., -0.31217121, -0.25541281],
            [-0., -0.31217121, -0.25541281],
            [-0., -0.31217121, -0.25541281],
            [-0., -0.31217121, -0.25541281],
            [-0., -0.31217121, -0.25541281],
            [0., 0.31217121, 0.],
            [-0., -0.31217121, -0.25541281],
            [0., 0., 0.41622829],
            [0., 0., 0.25541281],
            [0., 0., 0.25541281],
            [0., 0., 0.25541281],
            [0., 0., 0.25541281],
        ])
        computed = bm25.bm_25(self.TF_MATRIX, docs_content, 3, self.EXPECTED_IDF)
        np.testing.assert_allclose(computed, expected)
# Run the tests of this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 30 22:52:41 2021
"""
import numpy as np
from numpy import inf
import sys
sys.path.append('../src')
from tf_idf import tf_idf
import unittest
class SaneEqualityArray(np.ndarray):
    """ndarray whose ``==`` yields one truth value instead of an array."""

    def __eq__(self, other):
        """Tell whether ``other`` is a SaneEqualityArray with the same shape and content."""
        same_type = isinstance(other, SaneEqualityArray)
        if not same_type:
            return False
        if self.shape != other.shape:
            return False
        # Element-wise comparison collapsed into a single boolean.
        elementwise = np.ndarray.__eq__(self, other)
        return elementwise.all()
class TestTfIdf(unittest.TestCase):
    """Regression tests of the tf_idf helpers on a three-document toy corpus."""

    # Vocabulary words in row order.
    WORDS = ['i', 'love', 'shanghai', 'am', 'from', 'now', 'study', 'in',
             'tongji', 'university', 'lanzhou', 'of', 'science', 'and',
             'technolgy']

    # Term frequencies, one row per word and one column per document.
    TF_MATRIX = np.array([
        [1., 1., 1.], [1., 0., 0.], [1., 0.5, 0.],
        [0., 0.5, 0.5], [0., 0.5, 0.5], [0., 0.5, 0.5],
        [0., 0.5, 0.5], [0., 0.5, 0.5], [0., 0.5, 0.],
        [0., 0.5, 0.5], [0., 0., 1.], [0., 0., 0.5],
        [0., 0., 0.5], [0., 0., 0.5], [0., 0., 0.5],
    ])

    def _vocabulary(self):
        """Return the word -> row index mapping of the toy corpus."""
        return {word: row for row, word in enumerate(self.WORDS)}

    def test_TfIdf_1(self):
        """tf_classique divides each count by the document's largest count."""
        docs_content = [
            ['i', 'love', 'shanghai'],
            ['i', 'am', 'from', 'shanghai', 'now', 'i', 'study', 'in',
             'tongji', 'university'],
            ['i', 'am', 'from', 'lanzhou', 'now', 'i', 'study', 'in',
             'lanzhou', 'university', 'of', 'science', 'and', 'technolgy'],
        ]
        computed = tf_idf.tf_classique(docs_content, self._vocabulary(), 3)
        np.testing.assert_array_equal(computed, self.TF_MATRIX)

    def test_TfIdf_2(self):
        """Reference values of the logarithmic normalisation."""
        # The -inf entries are 1 + log(0), i.e. the terms absent from a
        # document; regenerate them if absent terms get another weight.
        ninf = -np.inf
        low = 0.30685282
        expected = np.array([
            [1., 1., 1.], [1., ninf, ninf], [1., low, ninf],
            [ninf, low, low], [ninf, low, low], [ninf, low, low],
            [ninf, low, low], [ninf, low, low], [ninf, low, ninf],
            [ninf, low, low], [ninf, ninf, 1.], [ninf, ninf, low],
            [ninf, ninf, low], [ninf, ninf, low], [ninf, ninf, low],
        ])
        computed = tf_idf.tf_normalisation_log(self.TF_MATRIX)
        np.testing.assert_allclose(computed, expected)

    def test_TfIdf_3(self):
        """Reference values of the max normalisation."""
        # These values divide every row (word) by its largest entry across
        # the documents; regenerate them if the normalisation axis changes.
        expected = np.array([
            [1., 1., 1.], [1., 0.5, 0.5], [1., 0.75, 0.5],
            [0.5, 1., 1.], [0.5, 1., 1.], [0.5, 1., 1.],
            [0.5, 1., 1.], [0.5, 1., 1.], [0.5, 1., 0.5],
            [0.5, 1., 1.], [0.5, 0.5, 1.], [0.5, 0.5, 1.],
            [0.5, 0.5, 1.], [0.5, 0.5, 1.], [0.5, 0.5, 1.],
        ])
        computed = tf_idf.tf_normalisation_max(self.TF_MATRIX)
        np.testing.assert_array_equal(computed, expected)

    def test_TfIdf_4(self):
        """Reference values of the inverse document frequency."""
        invert_index = {
            'i': [0, 1, 2], 'love': [0], 'shanghai': [0, 1], 'am': [1, 2],
            'from': [1, 2], 'now': [1, 2], 'study': [1, 2], 'in': [1, 2],
            'tongji': [1], 'university': [1, 2], 'lanzhou': [2], 'of': [2],
            'science': [2], 'and': [2], 'technolgy': [2],
        }
        # These values equal log(docuCount + 1/df + 1), e.g.
        # log(3 + 1/3 + 1) for 'i'; regenerate them if the IDF formula changes.
        expected = np.array([
            1.46633707, 1.60943791, 1.5040774, 1.5040774, 1.5040774,
            1.5040774, 1.5040774, 1.5040774, 1.60943791, 1.5040774,
            1.60943791, 1.60943791, 1.60943791, 1.60943791, 1.60943791,
        ]).reshape(-1, 1)
        computed = tf_idf.idf(invert_index, self._vocabulary(), 15, 3)
        np.testing.assert_allclose(computed, expected)
# Run the tests of this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment