diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6bcba9cb147ffb6a5cd01ac186598b10575c182b..4df0d2172373e276af1cce197c47b7815d1c2d05 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -15,12 +15,16 @@ lipn-search: - echo "Début de la phase de test" - echo "Début des tests sur l'index inversé" - cd index_inverse/test/ - - pytest test_hamming.py + - pytest test_tf_idf.py + - pytest test_bm25.py + - pytest test_IndexInverse.py - cd ../../ - echo "Fin des tests sur l'index inversé" - echo "Début des tests sur le pagerank" - cd pagerank/test/ - echo "Faire des trucs" - cd ../../ - - echo "Début des tests sur le pagerank" - - echo "Fin de la phase de test" \ No newline at end of file + - echo "Début des tests sur le pagerank" + - echo "Fin de la phase de test" + + diff --git a/index_inverse/src/IndexInverse.py b/index_inverse/src/IndexInverse.py old mode 100755 new mode 100644 index d1f858357a569a67a46281221824510c9ddad26a..d1bb2b251b28dbd8891910f748fe854b20f5bdaa --- a/index_inverse/src/IndexInverse.py +++ b/index_inverse/src/IndexInverse.py @@ -17,22 +17,18 @@ class IndexInverse : docuCount=0 #Le nombre de documents dans la classe vocaCount=0 #Le nombre de mots dans le dictionnaire vocabulary - def __init__(self,list_urls, list_content): - '''Constructeur IndexInverse''' - if len(list_urls)==len(list_content): #données valides - self.document_id = dict(zip(range(len(list_urls)), list_urls)) - doc= [c.lower().replace(",", "").replace("/n", "").split(" ") for c in list_content] - for d in doc: - while '' in d: - d.remove('') - self.docs_content=doc - self.invert_index=dict() - self.vocabulary=dict() - IndexInverse.docuCount= len(list_urls) - else: - #erreur: la taille de list_url faut egale à la taille de list_contenu - raise Exception("Invalid parameter!") + def __init__(self): + self.document_id = dict() + self.docs_content = dict() + self.invert_index = dict() + self.vocabulary = dict() + + def ajout_url(self, url , contenu): + self.document_id.setdefault(IndexInverse.docuCount,url) + doc= contenu.lower().split() + self.docs_content.setdefault(IndexInverse.docuCount,doc) + IndexInverse.docuCount += 1 def create_index (self): ''' Créer le dictionnaire invert_index. @@ -40,8 +36,8 @@ class IndexInverse : La valeur correspondant à la clé est une liste, qui enregistre les documents (document_id) dans lesquels le mot apparaît. ''' a=0 - for doc_id,article in zip(range(IndexInverse.docuCount),self.docs_content): - for word in article: + for doc_id in range(IndexInverse.docuCount): + for word in self.docs_content[doc_id]: if word in self.invert_index: if not doc_id in self.invert_index[word]: self.invert_index.setdefault(word,[]).append(doc_id) @@ -50,10 +46,9 @@ class IndexInverse : a=a+1 self.invert_index.setdefault(word,[]).append(doc_id) IndexInverse.vocaCount=len(self.vocabulary) - print(IndexInverse.vocaCount) - + def get_index(self): '''Obtenir le dictionnaire invert_index. ''' @@ -106,5 +101,3 @@ class IndexInverse : return - - diff --git a/index_inverse/src/__pycache__/bm25.cpython-37.pyc b/index_inverse/src/__pycache__/bm25.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1276f9fc41f3a22ee1c084d42426e37b62322aff Binary files /dev/null and b/index_inverse/src/__pycache__/bm25.cpython-37.pyc differ diff --git a/index_inverse/src/bm25.py b/index_inverse/src/bm25.py new file mode 100755 index 0000000000000000000000000000000000000000..e5371e5fbb0d4cab2abf976a516d048954349e79 --- /dev/null +++ b/index_inverse/src/bm25.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Mar 30 19:51:28 2021 + +@author: jinlili + +dm-25 +""" + +import numpy as np + +class bm25: + + def idf_2 (invert_index,vocabulary,vocaCount,docuCount): + '''Inverse Document Frequency (IDF) + IDF(t) = log(|D|−|Dt|+0.5/ |Dt|+0.5 ) + ''' + df = np.zeros((vocaCount, 1)) + for (key,val) in vocabulary.items(): + index_val=invert_index.get(key) + if index_val !=None: + df[val,0] = len(index_val) + idf_2=np.log((docuCount-df+0.5) / (df+0.5)) + # print((docuCount-df+0.5) / (df+0.5)) + return idf_2 + + def bm_25 (tf_matrix_classique,docs_content,docuCount,idf_2,k=1.2 , b=0.75): + ''' + k1 ∈ [1.2, 2.0] (en pratique) + b = 0.75 (en pratique) + |d| est la longueur du document + avgdl : Average Document Length, la longueur moyenne des documents de la collection + BM-25t,d =IDF(t)∗( ft,d ×(k1 +1) / ft,d + k1 × ( 1−b+b× (|d|/avgdl) ) ) + + ''' + d_longueur=np.array([]) + for d in docs_content: + d_longueur=np.append(d_longueur,len(d)) + sum_d=np.sum(d_longueur) + avg_d=sum_d/np.size(d_longueur) + bm_25=idf_2*(tf_matrix_classique*(k+1)/(tf_matrix_classique+k*(1-b+b*(d_longueur/avg_d)))) + return bm_25 + + diff --git a/index_inverse/src/tf_idf.py b/index_inverse/src/tf_idf.py new file mode 100755 index 0000000000000000000000000000000000000000..d39a86f718b6929403422428ced1b9a94063cb1d --- /dev/null +++ b/index_inverse/src/tf_idf.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Mar 30 02:16:23 2021 + + +TF-IDE +""" + +import numpy as np +from collections import Counter + +class tf_idf: + + def tf_classique (docs_content,vocabulary,docuCount): + ''' nombre d’occurrences de t dans d /nombre de mots dans d''' + tf_matrix = np.zeros((len(vocabulary),docuCount), dtype=np.float64) # [n_vocab, n_doc] + for id, doc in zip(range(docuCount),docs_content): + counter = Counter(doc) + for v in counter.keys(): + tf_matrix[vocabulary[v],id] = counter[v] / counter.most_common(1)[0][1] + return tf_matrix + + def tf_normalisation_log (tf_matrix_classique): + '''Normalisation Logarithmique: 1 + log ft ,d ''' + tf_matrix_log=np.log(tf_matrix_classique)+1 + return tf_matrix_log + + def tf_normalisation_max (tf_matrix_classique): + ''' Normalisationpar le max : 0.5 + 0.5 × ft,d/maxt′∈d ft′,d ''' + tf_matrix_max= 0.5 + 0.5 * (tf_matrix_classique / np.max(tf_matrix_classique, axis=1, keepdims=True)) + return tf_matrix_max + + def idf (invert_index,vocabulary,vocaCount,docuCount): + '''Inverse Document Frequency (IDF) + Soit un terme t et une collection de documents D, DFt =log(D/{d ∈ D | t apparaˆıt dans d}) + ''' + df = np.zeros((vocaCount, 1)) + for (key,val) in vocabulary.items(): + index_val=invert_index.get(key) + if index_val !=None: + df[val,0] = len(index_val) + idf=np.log(docuCount+1/df+1) #Le dénominateur ne peut pas être zéro + return idf + + + + + diff --git a/index_inverse/test/test_IndexInverse.py b/index_inverse/test/test_IndexInverse.py old mode 100755 new mode 100644 index 0ff3857ae528d66ba4d03c86fdd4c59e3a2696a0..57c537a9511fbac20569f01148faa90768942942 --- a/index_inverse/test/test_IndexInverse.py +++ b/index_inverse/test/test_IndexInverse.py @@ -8,83 +8,43 @@ Created on Sat Mar 13 18:41:56 2021 import sys sys.path.append('../src') from IndexInverse import IndexInverse -import unittest - -class TestIndexInverse(unittest.TestCase): +from unittest import TestCase + +class TestIndexInverse(TestCase): + index = IndexInverse() + index.ajout_url('lipn.fr_1', 'I love shanghai') + index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') + index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') + index.create_index() def test_IndexInverse_1(self): - doc=['lipn.fr_1','lipn.fr_2','lipn.fr_3'] - textes=['I love shanghai','i am from shanghai now i study in tongji university', - 'i am from lanzhou now i study in lanzhou university of science and technolgy'] - index=IndexInverse(doc,textes) - index.create_index() - result=index.get_docs_with_keyword('i') - self.assertEqual(result,[0,1,2]) + result = TestIndexInverse.index.get_docs_with_keyword('i') + self.assertEqual(result, [0,1,2]) def test_IndexInverse_2(self): - doc=['lipn.fr_1','lipn.fr_2','lipn.fr_3'] - textes=['I love shanghai','i am from shanghai now i study in tongji university', - 'i am from lanzhou now i study in lanzhou university of science and technolgy'] - index=IndexInverse(doc,textes) - index.create_index() - result=index.get_docs_with_keyword('like') - self.assertEqual(result,-1) + result = TestIndexInverse.index.get_docs_with_keyword('like') + self.assertEqual(result, -1) def test_IndexInverse_3(self): - doc=['lipn.fr_1','lipn.fr_2','lipn.fr_3'] - textes=['I love shanghai','i am from shanghai now i study in tongji university', - 'i am from lanzhou now i study in lanzhou university of science and technolgy'] - index=IndexInverse(doc,textes) - index.create_index() - result=index.get_nb_documents() - self.assertEqual(result,3) + result = TestIndexInverse.index.get_nb_documents() + self.assertEqual(result, 3) def test_IndexInverse_4(self): - doc=['lipn.fr_1','lipn.fr_2','lipn.fr_3'] - textes=['I love shanghai','i am from shanghai now i study in tongji university', - 'i am from lanzhou now i study in lanzhou university of science and technolgy'] - index=IndexInverse(doc,textes) - index.create_index() - result=index.get_nb_vocabularys() - self.assertEqual(result,15) + result = TestIndexInverse.index.get_nb_vocabularys() + self.assertEqual(result, 15) def test_IndexInverse_5(self): - doc=['lipn.fr_1','lipn.fr_2','lipn.fr_3'] - textes=['I love shanghai','i am from shanghai now i study in tongji university', - 'i am from lanzhou now i study in lanzhou university of science and technolgy'] - index=IndexInverse(doc,textes) - index.create_index() - result=index.get_vocabulary() - self.assertEqual(result, {'i': 0, 'love': 1, 'shanghai': 2, 'am': 3, 'from': 4, 'now': 5, 'study': 6, 'in': 7, 'tongji': 8, + result = TestIndexInverse.index.get_vocabulary() + self.assertEqual(result, {'i': 0, 'love': 1, 'shanghai': 2, 'am': 3, 'from': 4, 'now': 5, 'study': 6, 'in': 7, 'tongji': 8, 'university': 9, 'lanzhou': 10, 'of': 11, 'science': 12, 'and': 13, 'technolgy': 14}) def test_IndexInverse_6(self): - doc=['lipn.fr_1','lipn.fr_2','lipn.fr_3'] - textes=['I love shanghai','i am from shanghai now i study in tongji university', - 'i am from lanzhou now i study in lanzhou university of science and technolgy'] - index=IndexInverse(doc,textes) - index.create_index() - result=index.get_document_id() - self.assertEqual(result,{0: 'lipn.fr_1', 1: 'lipn.fr_2', 2: 'lipn.fr_3'}) + result = TestIndexInverse.index.get_document_id() + self.assertEqual(result, {0: 'lipn.fr_1', 1: 'lipn.fr_2', 2: 'lipn.fr_3'}) def test_IndexInverse_7(self): - doc=['lipn.fr_1','lipn.fr_2','lipn.fr_3'] - textes=['I love shanghai','i am from shanghai now i study in tongji university', - 'i am from lanzhou now i study in lanzhou university of science and technolgy'] - index=IndexInverse(doc,textes) - index.create_index() - result=index.get_docs_content() - self.assertEqual(result,[['i', 'love', 'shanghai'], ['i', 'am', 'from', 'shanghai', 'now', 'i', - 'study', 'in', 'tongji', 'university'], ['i', 'am', 'from', 'lanzhou', 'now', 'i', 'study', - 'in', 'lanzhou', 'university', 'of', 'science', 'and', 'technolgy']]) + result = TestIndexInverse.index.get_docs_content() + self.assertEqual(result, {0: ['i', 'love', 'shanghai'], 1: ['i','am','from', 'shanghai', 'now', 'i', 'study', 'in', 'tongji', 'university'],2: ['i', 'am', 'from', 'lanzhou', 'now', 'i', 'study', 'in', 'lanzhou', 'university', 'of', 'science', 'and', 'technolgy']}) + -if __name__ == '__main__': - unittest.main() - - - - - - - \ No newline at end of file diff --git a/index_inverse/test/test_bm25.py b/index_inverse/test/test_bm25.py new file mode 100755 index 0000000000000000000000000000000000000000..ab21bb2e1ed6b568331b13f648b771e26e5cf9e2 --- /dev/null +++ b/index_inverse/test/test_bm25.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Mar 31 00:08:41 2021 + +@author: jinlili +""" + + +import numpy as np +from numpy import inf +import sys +sys.path.append('../src') +from bm25 import bm25 +import unittest + + +class SaneEqualityArray(np.ndarray): + def __eq__(self, other): + return (isinstance(other, SaneEqualityArray) and self.shape == other.shape and np.ndarray.__eq__(self, other).all()) + + +class TestDM25(unittest.TestCase): + def test_BM25_1(self): + invert_index ={'i': [0, 1, 2], 'love': [0], 'shanghai': [0, 1], 'am': [1, 2], 'from': [1, 2], 'now': [1, 2], + 'study': [1, 2], 'in': [1, 2], 'tongji': [1], 'university': [1, 2], 'lanzhou': [2], 'of': [2], + 'science': [2], 'and': [2], 'technolgy': [2]} + vocabulary={'i': 0, 'love': 1, 'shanghai': 2, 'am': 3, 'from': 4, 'now': 5, 'study': 6, 'in': 7, 'tongji': 8, + 'university': 9, 'lanzhou': 10, 'of': 11, 'science': 12, 'and': 13, 'technolgy': 14} + docuCount=3 + vocaCount=15 + i2=bm25.idf_2 (invert_index,vocabulary,vocaCount,docuCount) + np.testing.assert_allclose( i2,np.array([[-1.94591015], + [ 0.51082562], + [-0.51082562], + [-0.51082562], + [-0.51082562], + [-0.51082562], + [-0.51082562], + [-0.51082562], + [ 0.51082562], + [-0.51082562], + [ 0.51082562], + [ 0.51082562], + [ 0.51082562], + [ 0.51082562], + [ 0.51082562]]) ) + + def test_BM5_2(self): + tf_matrix_classique=np.array([[1. , 1. , 1. ], + [1. , 0. , 0. ], + [1. , 0.5, 0. ], + [0. , 0.5, 0.5], + [0. , 0.5, 0.5], + [0. , 0.5, 0.5], + [0. , 0.5, 0.5], + [0. , 0.5, 0.5], + [0. , 0.5, 0. ], + [0. , 0.5, 0.5], + [0. , 0. , 1. ], + [0. , 0. , 0.5], + [0. , 0. , 0.5], + [0. , 0. , 0.5], + [0. , 0. , 0.5]]) + docs_content=[['i', 'love', 'shanghai'], ['i', 'am', 'from', 'shanghai', 'now', 'i', 'study', 'in', + 'tongji', 'university'], ['i', 'am', 'from', 'lanzhou', 'now', 'i', 'study', 'in', 'lanzhou', + 'university', 'of', 'science', 'and', 'technolgy']] + docuCount=3 + i2=np.array([[-1.94591015], + [ 0.51082562], + [-0.51082562], + [-0.51082562], + [-0.51082562], + [-0.51082562], + [-0.51082562], + [-0.51082562], + [ 0.51082562], + [-0.51082562], + [ 0.51082562], + [ 0.51082562], + [ 0.51082562], + [ 0.51082562], + [ 0.51082562]]) + b25=bm25.bm_25 (tf_matrix_classique,docs_content,docuCount,i2) + np.testing.assert_allclose( b25, np.array([[-2.67562645, -1.86130536, -1.58555642], + [ 0.70238523, 0. , 0. ], + [-0.70238523, -0.31217121, -0. ], + [-0. , -0.31217121, -0.25541281], + [-0. , -0.31217121, -0.25541281], + [-0. , -0.31217121, -0.25541281], + [-0. , -0.31217121, -0.25541281], + [-0. , -0.31217121, -0.25541281], + [ 0. , 0.31217121, 0. ], + [-0. , -0.31217121, -0.25541281], + [ 0. , 0. , 0.41622829], + [ 0. , 0. , 0.25541281], + [ 0. , 0. , 0.25541281], + [ 0. , 0. , 0.25541281], + [ 0. , 0. , 0.25541281]]) ) + + + + +if __name__ == '__main__': + unittest.main() + + + + + + + + + + + + + diff --git a/index_inverse/test/test_tf_idf.py b/index_inverse/test/test_tf_idf.py new file mode 100755 index 0000000000000000000000000000000000000000..e314fae67cc0230109d0509c1dba035218627c9b --- /dev/null +++ b/index_inverse/test/test_tf_idf.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Mar 30 22:52:41 2021 + + +""" + +import numpy as np +from numpy import inf +import sys +sys.path.append('../src') +from tf_idf import tf_idf +import unittest + +class SaneEqualityArray(np.ndarray): + def __eq__(self, other): + return (isinstance(other, SaneEqualityArray) and self.shape == other.shape and np.ndarray.__eq__(self, other).all()) + +class TestTfIdf(unittest.TestCase): + def test_TfIdf_1(self): + docs_content=[['i', 'love', 'shanghai'], ['i', 'am', 'from', 'shanghai', 'now', 'i', 'study', 'in', + 'tongji', 'university'], ['i', 'am', 'from', 'lanzhou', 'now', 'i', 'study', 'in', 'lanzhou', + 'university', 'of', 'science', 'and', 'technolgy']] + vocabulary= {'i': 0, 'love': 1, 'shanghai': 2, 'am': 3, 'from': 4, 'now': 5, 'study': 6, 'in': 7, 'tongji': 8, + 'university': 9, 'lanzhou': 10, 'of': 11, 'science': 12, 'and': 13, 'technolgy': 14} + docuCount=3 + tf_matrix_classique=tf_idf.tf_classique (docs_content,vocabulary,docuCount) + np.testing.assert_array_equal(tf_matrix_classique, np.array([[1. , 1. , 1. ], + [1. , 0. , 0. ], + [1. , 0.5, 0. ], + [0. , 0.5, 0.5], + [0. , 0.5, 0.5], + [0. , 0.5, 0.5], + [0. , 0.5, 0.5], + [0. , 0.5, 0.5], + [0. , 0.5, 0. ], + [0. , 0.5, 0.5], + [0. , 0. , 1. ], + [0. , 0. , 0.5], + [0. , 0. , 0.5], + [0. , 0. , 0.5], + [0. , 0. , 0.5]])) + + def test_TfIdf_2(self): + tf_matrix_classique=np.array([[1. , 1. , 1. ], + [1. , 0. , 0. ], + [1. , 0.5, 0. ], + [0. , 0.5, 0.5], + [0. , 0.5, 0.5], + [0. , 0.5, 0.5], + [0. , 0.5, 0.5], + [0. , 0.5, 0.5], + [0. , 0.5, 0. ], + [0. , 0.5, 0.5], + [0. , 0. , 1. ], + [0. , 0. , 0.5], + [0. , 0. , 0.5], + [0. , 0. , 0.5], + [0. , 0. , 0.5]]) + mat_nor_log=tf_idf.tf_normalisation_log (tf_matrix_classique) + np.testing.assert_allclose( mat_nor_log,np.array([[1. , 1. , 1. ], + [1. , -inf, -inf], + [1. , 0.30685282, -inf], + [ -inf, 0.30685282, 0.30685282], + [ -inf, 0.30685282, 0.30685282], + [ -inf, 0.30685282, 0.30685282], + [ -inf, 0.30685282, 0.30685282], + [ -inf, 0.30685282, 0.30685282], + [ -inf, 0.30685282, -inf], + [ -inf, 0.30685282, 0.30685282], + [ -inf, -inf, 1. ], + [ -inf, -inf, 0.30685282], + [ -inf, -inf, 0.30685282], + [ -inf, -inf, 0.30685282], + [ -inf, -inf, 0.30685282]]) ) + + def test_TfIdf_3(self): + tf_matrix_classique=np.array([[1. , 1. , 1. ], + [1. , 0. , 0. ], + [1. , 0.5, 0. ], + [0. , 0.5, 0.5], + [0. , 0.5, 0.5], + [0. , 0.5, 0.5], + [0. , 0.5, 0.5], + [0. , 0.5, 0.5], + [0. , 0.5, 0. ], + [0. , 0.5, 0.5], + [0. , 0. , 1. ], + [0. , 0. , 0.5], + [0. , 0. , 0.5], + [0. , 0. , 0.5], + [0. , 0. , 0.5]]) + mat_nor_max=tf_idf.tf_normalisation_max (tf_matrix_classique) + np.testing.assert_array_equal( mat_nor_max,np.array([[1. , 1. , 1. ], + [1. , 0.5 , 0.5 ], + [1. , 0.75, 0.5 ], + [0.5 , 1. , 1. ], + [0.5 , 1. , 1. ], + [0.5 , 1. , 1. ], + [0.5 , 1. , 1. ], + [0.5 , 1. , 1. ], + [0.5 , 1. , 0.5 ], + [0.5 , 1. , 1. ], + [0.5 , 0.5 , 1. ], + [0.5 , 0.5 , 1. ], + [0.5 , 0.5 , 1. ], + [0.5 , 0.5 , 1. ], + [0.5 , 0.5 , 1. ]]) ) + + + def test_TfIdf_4(self): + invert_index={'i': [0, 1, 2], 'love': [0], 'shanghai': [0, 1], 'am': [1, 2], 'from': [1, 2], 'now': [1, 2], 'study': [1, 2], 'in': [1, 2], 'tongji': [1], 'university': [1, 2], 'lanzhou': [2], 'of': [2], 'science': [2], 'and': [2], 'technolgy': [2]} + vocabulary={'i': 0, 'love': 1, 'shanghai': 2, 'am': 3, 'from': 4, 'now': 5, 'study': 6, 'in': 7, 'tongji': 8, + 'university': 9, 'lanzhou': 10, 'of': 11, 'science': 12, 'and': 13, 'technolgy': 14} + vocaCount=15 + docuCount=3 + mat_idf=tf_idf.idf(invert_index,vocabulary,vocaCount,docuCount) + np.testing.assert_allclose(mat_idf,np.array([[1.46633707], + [1.60943791], + [1.5040774 ], + [1.5040774 ], + [1.5040774 ], + [1.5040774 ], + [1.5040774 ], + [1.5040774 ], + [1.60943791], + [1.5040774 ], + [1.60943791], + [1.60943791], + [1.60943791], + [1.60943791], + [1.60943791]]) ) + + +if __name__ == '__main__': + unittest.main() + diff --git a/pagerank/.DS_Store b/pagerank/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..95b5f197a236ab1803b61635259f1c29c270a943 Binary files /dev/null and b/pagerank/.DS_Store differ