diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 4df0d2172373e276af1cce197c47b7815d1c2d05..0b1ba83ce394716bcf8f2d0b7e64f950f810e183 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -15,9 +15,11 @@ lipn-search:
     - echo "Starting the test phase"
     - echo "Starting the inverted-index tests"
     - cd index_inverse/test/
+    - pytest test_IndexInverse.py
     - pytest test_tf_idf.py
     - pytest test_bm25.py
-    - pytest test_IndexInverse.py
+    - pytest test_sort_index.py
+    - pytest test_recherche.py
     - cd ../../
     - echo "Finished the inverted-index tests"
     - echo "Starting the pagerank tests"
diff --git a/index_inverse/src/IndexInverse.py b/index_inverse/src/IndexInverse.py
index d1bb2b251b28dbd8891910f748fe854b20f5bdaa..3a8b1fc5d45c299decbe893aebd5fc75238e125d 100644
--- a/index_inverse/src/IndexInverse.py
+++ b/index_inverse/src/IndexInverse.py
@@ -14,21 +14,21 @@ class IndexInverse :
     - vocabulary: a dictionary of the words seen across all documents. Each word has its own ID, e.g. {'I': 0, 'am': 1} (same order as the words in invert_index).
     """
-
-    docuCount=0   # number of documents in the class
-    vocaCount=0   # number of words in the vocabulary dictionary
+
     def __init__(self):
         self.document_id = dict()
-        self.docs_content = dict()
+        self.docs_content = []
         self.invert_index = dict()
         self.vocabulary = dict()
+        self.docuCount=0   # number of documents in this instance
+        self.vocaCount=0   # number of words in the vocabulary dictionary
 
     def ajout_url(self, url , contenu):
-        self.document_id.setdefault(IndexInverse.docuCount,url)
-        doc= contenu.lower().split()
-        self.docs_content.setdefault(IndexInverse.docuCount,doc)
-        IndexInverse.docuCount += 1
+        self.document_id.setdefault(self.docuCount,url)
+        doc= contenu.lower().replace(",", "").replace("\n", " ").split()
+        self.docs_content.append(doc)
+        self.docuCount = self.docuCount+1
 
     def create_index (self):
         ''' Build the invert_index dictionary.
@@ -36,19 +36,18 @@ class IndexInverse :
         The value associated with each key is a list recording the documents (document_id) in which the word appears.
         '''
         a=0
-        for doc_id in range(IndexInverse.docuCount):
-            for word in self.docs_content[doc_id]:
+        for doc_id,article in zip(range(self.docuCount),self.docs_content):
+            for word in article:
                 if word in self.invert_index:
                     if not doc_id in self.invert_index[word]:
                         self.invert_index.setdefault(word,[]).append(doc_id)
                 else:
                     self.vocabulary[word]=a
                     a=a+1
-                    self.invert_index.setdefault(word,[]).append(doc_id)
-        IndexInverse.vocaCount=len(self.vocabulary)
-
-
-
+                    self.invert_index.setdefault(word,[]).append(doc_id)
+        self.vocaCount=len(self.vocabulary)
+
+
     def get_index(self):
         '''Return the invert_index dictionary. '''
@@ -67,12 +66,12 @@ class IndexInverse :
     def get_nb_documents(self):
         '''Return the number of documents in this class '''
-        return IndexInverse.docuCount
+        return self.docuCount
 
     def get_nb_vocabularys(self):
         '''Return the number of words in the vocabulary dictionary '''
-        return IndexInverse.vocaCount
+        return self.vocaCount
 
     def get_vocabulary(self):
         '''Return the dictionary that contains the words
@@ -92,7 +91,7 @@ class IndexInverse :
     def show_index(self):
         '''Print the invert_index dictionary.
         '''
-        print ('NbDocument='+str(IndexInverse.docuCount))
+        print ('NbDocument='+str(self.docuCount))
         for word, IDs in self.invert_index.items():
             print('')
             print(word+':',end=' ')
diff --git a/index_inverse/src/__pycache__/bm25.cpython-37.pyc b/index_inverse/src/__pycache__/bm25.cpython-37.pyc
deleted file mode 100644
index 1276f9fc41f3a22ee1c084d42426e37b62322aff..0000000000000000000000000000000000000000
Binary files a/index_inverse/src/__pycache__/bm25.cpython-37.pyc and /dev/null differ
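Editor's note: moving docuCount / vocaCount from class attributes into __init__ is the heart of this patch; the old class-level counters leaked state between instances, which is why the test files below now build a fresh index per test. A minimal usage sketch of the per-instance API (my own illustration, not part of the patch; it assumes the sketch runs from the repository root so that index_inverse/src is importable):

import sys
sys.path.append('index_inverse/src')  # assumption: executed from the repo root

from IndexInverse import IndexInverse

index = IndexInverse()
index.ajout_url('lipn.fr_1', 'I love shanghai')
index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university')
index.create_index()

print(index.get_nb_documents())                 # 2
print(index.get_docs_with_keyword('shanghai'))  # [0, 1]

Because the counters are now instance attributes, creating a second IndexInverse no longer inherits counts from the first.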
diff --git a/index_inverse/src/bm25.py b/index_inverse/src/bm25.py
index e5371e5fbb0d4cab2abf976a516d048954349e79..241639e67c21603fffe21607d6c525f0f561cb25 100755
--- a/index_inverse/src/bm25.py
+++ b/index_inverse/src/bm25.py
@@ -3,9 +3,9 @@
 """
 Created on Tue Mar 30 19:51:28 2021
 
-@author: jinlili
-dm-25
+
+bm-25
 """
 
 import numpy as np
@@ -25,7 +25,7 @@ class bm25:
         # print((docuCount-df+0.5) / (df+0.5))
         return idf_2
 
-    def bm_25 (tf_matrix_classique,docs_content,docuCount,idf_2,k=1.2 , b=0.75):
+    def bm_25 (nb_occ,docs_content,docuCount,idf_2,k=1.2 , b=0.75):
         '''
         k1 ∈ [1.2, 2.0] (in practice)
         b = 0.75 (in practice)
@@ -39,7 +39,13 @@ class bm25:
             d_longueur=np.append(d_longueur,len(d))
         sum_d=np.sum(d_longueur)
         avg_d=sum_d/np.size(d_longueur)
-        bm_25=idf_2*(tf_matrix_classique*(k+1)/(tf_matrix_classique+k*(1-b+b*(d_longueur/avg_d))))
+        bm_25=idf_2*(nb_occ*(k+1)/(nb_occ+k*(1-b+b*(d_longueur/avg_d))))
         return bm_25
+
+
+
+
+
+
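Editor's note: the bm_25 formula above can be hand-checked for a single term, which also confirms that the renamed nb_occ parameter expects raw occurrence counts rather than the normalized tf matrix. A standalone sketch (not part of the patch), using values from the test_bm25.py fixture further down: N = 3 documents, the term 'love' with document frequency 1, one occurrence in the 3-word document, and average document length (3 + 10 + 14) / 3 = 9:

import numpy as np

k, b = 1.2, 0.75
idf_2 = np.log((3 - 1 + 0.5) / (1 + 0.5))  # the (docuCount-df+0.5)/(df+0.5) form shown above, ~0.51082562
tf = 1.0                                   # occurrences of 'love' in document 0
score = idf_2 * (tf * (k + 1)) / (tf + k * (1 - b + b * (3 / 9)))
print(score)                               # ~0.70238523, the value asserted in test_bm25.py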
") + return -1 + + sort_score=np.argsort(-score, axis=1) + for i,(word,docs) in zip (range(IndexInverse.get_nb_vocabularys()), IndexInverse.get_index().items()): + n=len(docs) + li=sort_score[i,:n] + IndexInverse.get_index()[word]=li.tolist() + return IndexInverse,score + + + + + + + + + + + diff --git a/index_inverse/src/tf_idf.py b/index_inverse/src/tf_idf.py index d39a86f718b6929403422428ced1b9a94063cb1d..d47da85063c6b2f651a2c0a8a507957ff7dea2de 100755 --- a/index_inverse/src/tf_idf.py +++ b/index_inverse/src/tf_idf.py @@ -11,30 +11,41 @@ import numpy as np from collections import Counter class tf_idf: - + # matrice_tf: [n_vocab, n_doc] + # matrice_idf: [n_vocab, 1] + + def nb_occ (docs_content,vocabulary,docuCount): + ''' nombre d’occurrences de t dans d ''' + tf_matrix = np.zeros((len(vocabulary),docuCount), dtype=np.float64) # [n_vocab, n_doc] + for id, doc in zip(range(docuCount),docs_content): + counter = Counter(doc) + for v in counter.keys(): + tf_matrix[vocabulary[v],id] = counter[v] + return tf_matrix + def tf_classique (docs_content,vocabulary,docuCount): ''' nombre d’occurrences de t dans d /nombre de mots dans d''' tf_matrix = np.zeros((len(vocabulary),docuCount), dtype=np.float64) # [n_vocab, n_doc] for id, doc in zip(range(docuCount),docs_content): counter = Counter(doc) for v in counter.keys(): - tf_matrix[vocabulary[v],id] = counter[v] / counter.most_common(1)[0][1] + tf_matrix[vocabulary[v],id] = counter[v] / len(doc) return tf_matrix - - def tf_normalisation_log (tf_matrix_classique): + + def tf_normalisation_log (nb_occ): '''Normalisation Logarithmique: 1 + log ft ,d ''' - tf_matrix_log=np.log(tf_matrix_classique)+1 + tf_matrix_log=np.log(nb_occ)+1 return tf_matrix_log - def tf_normalisation_max (tf_matrix_classique): + def tf_normalisation_max (nb_occ): ''' Normalisationpar le max : 0.5 + 0.5 × ft,d/maxt′∈d ft′,d ''' - tf_matrix_max= 0.5 + 0.5 * (tf_matrix_classique / np.max(tf_matrix_classique, axis=1, keepdims=True)) + tf_matrix_max= 0.5 + 0.5 * (nb_occ / np.max(nb_occ, axis=1, keepdims=True)) return tf_matrix_max def idf (invert_index,vocabulary,vocaCount,docuCount): '''Inverse Document Frequency (IDF) - Soit un terme t et une collection de documents D, DFt =log(D/{d ∈ D | t apparaˆıt dans d}) - ''' + Soit un terme t et une collection de documents D, DFt =log(D/{d ∈ D | t apparaˆıt dans d}) + ''' df = np.zeros((vocaCount, 1)) for (key,val) in vocabulary.items(): index_val=invert_index.get(key) @@ -47,3 +58,4 @@ class tf_idf: + diff --git a/index_inverse/test/test_IndexInverse.py b/index_inverse/test/test_IndexInverse.py index 57c537a9511fbac20569f01148faa90768942942..fcd1502c1c8d3ac74cc11116c705c84a853b449f 100644 --- a/index_inverse/test/test_IndexInverse.py +++ b/index_inverse/test/test_IndexInverse.py @@ -1,50 +1,83 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Sat Mar 13 18:41:56 2021 - - -""" -import sys -sys.path.append('../src') -from IndexInverse import IndexInverse -from unittest import TestCase - -class TestIndexInverse(TestCase): - index = IndexInverse() - index.ajout_url('lipn.fr_1', 'I love shanghai') - index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') - index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') - index.create_index() - def test_IndexInverse_1(self): - result = TestIndexInverse.index.get_docs_with_keyword('i') - self.assertEqual(result, [0,1,2]) - - def test_IndexInverse_2(self): - result = 
diff --git a/index_inverse/src/recherche.py b/index_inverse/src/recherche.py
new file mode 100755
index 0000000000000000000000000000000000000000..8bcaaf443481a2e1b357a9dced0dace31eeacf29
--- /dev/null
+++ b/index_inverse/src/recherche.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Apr 13 00:37:22 2021
+
+
+"""
+import numpy as np
+
+
+'''
+    -- sequence_de_mot_cle: a list of strings (the query keywords)
+    -- index_inverse: an object built with the IndexInverse class
+    -- score: the score matrix obtained from the sort_index function
+'''
+def recherche (sequence_de_mot_cle, index_inverse, score):
+    n=len(sequence_de_mot_cle)
+    if n==0:
+        return -1
+    elif n==1:  # if there is only one keyword
+        docs_id=index_inverse.get_index()[sequence_de_mot_cle[0]][:10]
+        docs=[]
+        for i in range(len(docs_id)):
+            docs.append(index_inverse.get_document_id()[docs_id[i]])
+        return docs
+    else:
+        vocabulary=[]
+        score_mots=np.zeros(index_inverse.get_nb_documents())
+        for j in range (n):
+            if sequence_de_mot_cle[j].lower() in index_inverse.get_vocabulary().keys():
+                vocabulary.append(index_inverse.get_vocabulary()[sequence_de_mot_cle[j]])
+        for m in range (len(vocabulary)):
+            temp=score[vocabulary[m],:]
+            score_mots=score_mots+temp
+        sort_score_mots=np.argsort(-score_mots)
+        docs_id=sort_score_mots[:10]
+        docs=[]
+        for i in range(len(docs_id)):
+            docs.append(index_inverse.get_document_id()[docs_id[i]])
+        return docs
+
+def recherche2(self, liste_mot, matrice):
+    liste_score = dict()
+    for docid in range(self.docuCount):
+        score = 0
+        for word in liste_mot:
+            if word in self.vocabulary:
+                score = matrice[self.vocabulary[word]][docid]
+        liste_score.setdefault(self.document_id[docid],score)
+    liste_score[self.document_id[0]] = 1
+    liste_score = sorted(liste_score.items(), key=lambda t: t[1])
+
+    if self.docuCount > 10 :
+        top10 = list()
+        for i in range(10):
+            m = self.docuCount - 1 - i
+            top10.append(liste_score[m][0])
+        return top10
+
+    return liste_score
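Editor's note: recherche ties the index and the score matrix together. A minimal end-to-end sketch (my illustration, using the same fixture as test_recherche.py below; assumes index_inverse/src is on sys.path):

from IndexInverse import IndexInverse
from sort_index import sort_index
from recherche import recherche

index = IndexInverse()
index.ajout_url('lipn.fr_1', 'I love shanghai')
index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university')
index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy')
index.create_index()

index, score = sort_index(index)             # default "tf_classique/idf" ranking
print(recherche(["i", "am"], index, score))  # up to 10 URLs, best match first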
diff --git a/index_inverse/src/sort_index.py b/index_inverse/src/sort_index.py
new file mode 100755
index 0000000000000000000000000000000000000000..7098aca1c3173b5400a1a1c338914c72f0d726df
--- /dev/null
+++ b/index_inverse/src/sort_index.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Apr 12 20:07:56 2021
+
+
+"""
+
+import numpy as np
+from tf_idf import tf_idf
+from bm25 import bm25
+
+def sort_index (IndexInverse, standard="tf_classique/idf"):
+    # tf_idf = TF * IDF
+    # matrice_tf:    [n_vocab, n_doc]
+    # matrice_idf:   [n_vocab, 1]
+    # matrice_score: [n_vocab, n_doc]
+    if (standard=="tf_classique/idf"):
+        tf_matrix=tf_idf.tf_classique(IndexInverse.get_docs_content(),IndexInverse.get_vocabulary(),IndexInverse.get_nb_documents())
+        idf=tf_idf.idf(IndexInverse.get_index(),IndexInverse.get_vocabulary(),IndexInverse.get_nb_vocabularys(),IndexInverse.get_nb_documents())
+        score = tf_matrix * idf
+    elif (standard=="tf_normalisation_log/idf"):
+        nb_occ=tf_idf.nb_occ(IndexInverse.get_docs_content(),IndexInverse.get_vocabulary(),IndexInverse.get_nb_documents())
+        idf=tf_idf.idf(IndexInverse.get_index(),IndexInverse.get_vocabulary(),IndexInverse.get_nb_vocabularys(),IndexInverse.get_nb_documents())
+        tf_normalisation_log=tf_idf.tf_normalisation_log(nb_occ)
+        score = tf_normalisation_log * idf
+    elif (standard=="tf_normalisation_max/idf"):
+        nb_occ=tf_idf.nb_occ(IndexInverse.get_docs_content(),IndexInverse.get_vocabulary(),IndexInverse.get_nb_documents())
+        idf=tf_idf.idf(IndexInverse.get_index(),IndexInverse.get_vocabulary(),IndexInverse.get_nb_vocabularys(),IndexInverse.get_nb_documents())
+        tf_normalisation_max=tf_idf.tf_normalisation_max(nb_occ)
+        score = tf_normalisation_max * idf
+    elif (standard=="bm_25"):
+        nb_occ=tf_idf.nb_occ(IndexInverse.get_docs_content(),IndexInverse.get_vocabulary(),IndexInverse.get_nb_documents())
+        idf_2=bm25.idf_2(IndexInverse.get_index(),IndexInverse.get_vocabulary(),IndexInverse.get_nb_vocabularys(),IndexInverse.get_nb_documents())
+        score=bm25.bm_25(nb_occ,IndexInverse.get_docs_content(),IndexInverse.get_nb_documents(),idf_2)
+    else:
+        print("Please enter a valid sort method.")
+        return -1
+
+    sort_score=np.argsort(-score, axis=1)
+    for i,(word,docs) in zip (range(IndexInverse.get_nb_vocabularys()), IndexInverse.get_index().items()):
+        n=len(docs)
+        li=sort_score[i,:n]
+        IndexInverse.get_index()[word]=li.tolist()
+    return IndexInverse,score
+
+
+
+
+
+
+
+
+
+
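Editor's note: the ranking step relies on np.argsort(-score, axis=1): negating the scores turns numpy's ascending argsort into a descending one, so each vocabulary row ends up listing document ids from best to worst. A standalone illustration:

import numpy as np

score = np.array([[0.2, 0.9, 0.5]])
print(np.argsort(-score, axis=1))  # [[1 2 0]] : document 1 ranks first, document 0 last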
diff --git a/index_inverse/src/tf_idf.py b/index_inverse/src/tf_idf.py
index d39a86f718b6929403422428ced1b9a94063cb1d..d47da85063c6b2f651a2c0a8a507957ff7dea2de 100755
--- a/index_inverse/src/tf_idf.py
+++ b/index_inverse/src/tf_idf.py
@@ -11,30 +11,41 @@ import numpy as np
 from collections import Counter
 
 class tf_idf:
-
+    # matrice_tf:  [n_vocab, n_doc]
+    # matrice_idf: [n_vocab, 1]
+
+    def nb_occ (docs_content,vocabulary,docuCount):
+        ''' number of occurrences of t in d '''
+        tf_matrix = np.zeros((len(vocabulary),docuCount), dtype=np.float64)  # [n_vocab, n_doc]
+        for id, doc in zip(range(docuCount),docs_content):
+            counter = Counter(doc)
+            for v in counter.keys():
+                tf_matrix[vocabulary[v],id] = counter[v]
+        return tf_matrix
+
     def tf_classique (docs_content,vocabulary,docuCount):
         ''' number of occurrences of t in d / number of words in d '''
         tf_matrix = np.zeros((len(vocabulary),docuCount), dtype=np.float64)  # [n_vocab, n_doc]
         for id, doc in zip(range(docuCount),docs_content):
             counter = Counter(doc)
             for v in counter.keys():
-                tf_matrix[vocabulary[v],id] = counter[v] / counter.most_common(1)[0][1]
+                tf_matrix[vocabulary[v],id] = counter[v] / len(doc)
         return tf_matrix
-
-    def tf_normalisation_log (tf_matrix_classique):
+
+    def tf_normalisation_log (nb_occ):
         '''Logarithmic normalization: 1 + log f(t,d) '''
-        tf_matrix_log=np.log(tf_matrix_classique)+1
+        tf_matrix_log=np.log(nb_occ)+1
         return tf_matrix_log
 
-    def tf_normalisation_max (tf_matrix_classique):
+    def tf_normalisation_max (nb_occ):
         ''' Normalization by the max: 0.5 + 0.5 × f(t,d) / max_{t′∈d} f(t′,d) '''
-        tf_matrix_max= 0.5 + 0.5 * (tf_matrix_classique / np.max(tf_matrix_classique, axis=1, keepdims=True))
+        tf_matrix_max= 0.5 + 0.5 * (nb_occ / np.max(nb_occ, axis=1, keepdims=True))
         return tf_matrix_max
 
     def idf (invert_index,vocabulary,vocaCount,docuCount):
         '''Inverse Document Frequency (IDF)
-           Given a term t and a collection of documents D, IDF_t = log(|D| / |{d ∈ D | t appears in d}|)
-        '''
+           Given a term t and a collection of documents D, IDF_t = log(|D| / |{d ∈ D | t appears in d}|)
+        '''
         df = np.zeros((vocaCount, 1))
         for (key,val) in vocabulary.items():
             index_val=invert_index.get(key)
@@ -47,3 +58,4 @@ class tf_idf:
 
 
 
+
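Editor's note: a quick numerical cross-check of the corrected tf_classique (occurrence count divided by document length, rather than by the most frequent term's count). In the test fixtures below, 'i' occurs once in the 3-word document 0, and idf() yields 1.46633707 for 'i', so the tf-idf score should be one third of that:

tf = 1 / 3                   # 'i' appears once in the 3-word document 0
idf_i = 1.46633707           # idf value for 'i' asserted in test_tf_idf.py
print(round(tf * idf_i, 8))  # 0.48877902, the value asserted in test_sort_index.py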
diff --git a/index_inverse/test/test_IndexInverse.py b/index_inverse/test/test_IndexInverse.py
index 57c537a9511fbac20569f01148faa90768942942..fcd1502c1c8d3ac74cc11116c705c84a853b449f 100644
--- a/index_inverse/test/test_IndexInverse.py
+++ b/index_inverse/test/test_IndexInverse.py
@@ -1,50 +1,83 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Sat Mar 13 18:41:56 2021
-
-
-"""
-import sys
-sys.path.append('../src')
-from IndexInverse import IndexInverse
-from unittest import TestCase
-
-class TestIndexInverse(TestCase):
-    index = IndexInverse()
-    index.ajout_url('lipn.fr_1', 'I love shanghai')
-    index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university')
-    index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy')
-    index.create_index()
-    def test_IndexInverse_1(self):
-        result = TestIndexInverse.index.get_docs_with_keyword('i')
-        self.assertEqual(result, [0,1,2])
-
-    def test_IndexInverse_2(self):
-        result = TestIndexInverse.index.get_docs_with_keyword('like')
-        self.assertEqual(result, -1)
-
-    def test_IndexInverse_3(self):
-        result = TestIndexInverse.index.get_nb_documents()
-        self.assertEqual(result, 3)
-
-    def test_IndexInverse_4(self):
-        result = TestIndexInverse.index.get_nb_vocabularys()
-        self.assertEqual(result, 15)
-
-    def test_IndexInverse_5(self):
-        result = TestIndexInverse.index.get_vocabulary()
-        self.assertEqual(result, {'i': 0, 'love': 1, 'shanghai': 2, 'am': 3, 'from': 4, 'now': 5, 'study': 6, 'in': 7, 'tongji': 8,
-                                  'university': 9, 'lanzhou': 10, 'of': 11, 'science': 12, 'and': 13, 'technolgy': 14})
-
-    def test_IndexInverse_6(self):
-        result = TestIndexInverse.index.get_document_id()
-        self.assertEqual(result, {0: 'lipn.fr_1', 1: 'lipn.fr_2', 2: 'lipn.fr_3'})
-
-    def test_IndexInverse_7(self):
-        result = TestIndexInverse.index.get_docs_content()
-        self.assertEqual(result, {0: ['i', 'love', 'shanghai'], 1: ['i','am','from', 'shanghai', 'now', 'i', 'study', 'in', 'tongji', 'university'],2: ['i', 'am', 'from', 'lanzhou', 'now', 'i', 'study', 'in', 'lanzhou', 'university', 'of', 'science', 'and', 'technolgy']})
-
-
-
-
+
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Mar 13 18:41:56 2021
+
+
+"""
+import sys
+sys.path.append('../src')
+from IndexInverse import IndexInverse
+import unittest
+
+class TestIndexInverse(unittest.TestCase):
+    def test_IndexInverse_1(self):
+        index1=IndexInverse()
+        index1.ajout_url('lipn.fr_1', 'I love shanghai')
+        index1.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university')
+        index1.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy')
+        index1.create_index()
+        result = index1.get_nb_documents()
+        self.assertEqual(result, 3)
+
+    def test_IndexInverse_2(self):
+        index2=IndexInverse()
+        index2.ajout_url('lipn.fr_1', 'I love shanghai')
+        index2.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university')
+        index2.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy')
+        index2.create_index()
+        result = index2.get_docs_with_keyword('like')
+        self.assertEqual(result, -1)
+
+    def test_IndexInverse_3(self):
+        index3=IndexInverse()
+        index3.ajout_url('lipn.fr_1', 'I love shanghai')
+        index3.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university')
+        index3.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy')
+        index3.create_index()
+        result = index3.get_docs_with_keyword('i')
+        self.assertEqual(result, [0,1,2])
+
+    def test_IndexInverse_4(self):
+        index4=IndexInverse()
+        index4.ajout_url('lipn.fr_1', 'I love shanghai')
+        index4.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university')
+        index4.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy')
+        index4.create_index()
+        result = index4.get_nb_vocabularys()
+        self.assertEqual(result, 15)
+
+    def test_IndexInverse_5(self):
+        index5=IndexInverse()
+        index5.ajout_url('lipn.fr_1', 'I love shanghai')
+        index5.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university')
+        index5.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy')
+        index5.create_index()
+        result = index5.get_vocabulary()
+        self.assertEqual(result, {'i': 0, 'love': 1, 'shanghai': 2, 'am': 3, 'from': 4, 'now': 5, 'study': 6, 'in': 7, 'tongji': 8,
+                                  'university': 9, 'lanzhou': 10, 'of': 11, 'science': 12, 'and': 13, 'technolgy': 14})
+
+    def test_IndexInverse_6(self):
+        index6=IndexInverse()
+        index6.ajout_url('lipn.fr_1', 'I love shanghai')
+        index6.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university')
+        index6.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy')
+        index6.create_index()
+        result = index6.get_document_id()
+        self.assertEqual(result, {0: 'lipn.fr_1', 1: 'lipn.fr_2', 2: 'lipn.fr_3'})
+
+    def test_IndexInverse_7(self):
+        index7=IndexInverse()
+        index7.ajout_url('lipn.fr_1', 'I love shanghai')
+        index7.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university')
+        index7.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy')
+        index7.create_index()
+        result = index7.get_docs_content()
+        self.assertEqual(result, [['i', 'love', 'shanghai'], ['i','am','from', 'shanghai', 'now', 'i', 'study', 'in', 'tongji', 'university'],['i', 'am', 'from', 'lanzhou', 'now', 'i', 'study', 'in', 'lanzhou', 'university', 'of', 'science', 'and', 'technolgy']])
+
+
+
+if __name__ == '__main__':
+    unittest.main()
+
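Editor's note: the rewritten tests rebuild an identical three-document fixture in every method, which guarantees isolation now that the counters are per-instance. If the duplication ever becomes a burden, a setUp hook would keep that isolation with less repetition; a possible sketch, not part of this patch:

import unittest
from IndexInverse import IndexInverse

class TestIndexInverse(unittest.TestCase):
    def setUp(self):
        # a fresh index per test, preserving the isolation this patch introduces
        self.index = IndexInverse()
        self.index.ajout_url('lipn.fr_1', 'I love shanghai')
        self.index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university')
        self.index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy')
        self.index.create_index()

    def test_nb_documents(self):
        self.assertEqual(self.index.get_nb_documents(), 3)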
diff --git a/index_inverse/test/test_bm25.py b/index_inverse/test/test_bm25.py
index ab21bb2e1ed6b568331b13f648b771e26e5cf9e2..a093831f4ab9a2f91b9b9e8408863431f802e75c 100755
--- a/index_inverse/test/test_bm25.py
+++ b/index_inverse/test/test_bm25.py
@@ -3,7 +3,7 @@
 """
 Created on Wed Mar 31 00:08:41 2021
 
-@author: jinlili
+
 
 """
@@ -47,21 +47,21 @@ class TestDM25(unittest.TestCase):
                                              [ 0.51082562]]) )
 
     def test_BM5_2(self):
-        tf_matrix_classique=np.array([[1. , 1. , 1. ],
-                                      [1. , 0. , 0. ],
-                                      [1. , 0.5, 0. ],
-                                      [0. , 0.5, 0.5],
-                                      [0. , 0.5, 0.5],
-                                      [0. , 0.5, 0.5],
-                                      [0. , 0.5, 0.5],
-                                      [0. , 0.5, 0.5],
-                                      [0. , 0.5, 0. ],
-                                      [0. , 0.5, 0.5],
-                                      [0. , 0. , 1. ],
-                                      [0. , 0. , 0.5],
-                                      [0. , 0. , 0.5],
-                                      [0. , 0. , 0.5],
-                                      [0. , 0. , 0.5]])
+        nb_occ=np.array([[1., 2., 2.],
+                         [1., 0., 0.],
+                         [1., 1., 0.],
+                         [0., 1., 1.],
+                         [0., 1., 1.],
+                         [0., 1., 1.],
+                         [0., 1., 1.],
+                         [0., 1., 1.],
+                         [0., 1., 0.],
+                         [0., 1., 1.],
+                         [0., 0., 2.],
+                         [0., 0., 1.],
+                         [0., 0., 1.],
+                         [0., 0., 1.],
+                         [0., 0., 1.]])
         docs_content=[['i', 'love', 'shanghai'],
                       ['i', 'am', 'from', 'shanghai', 'now', 'i', 'study', 'in', 'tongji', 'university'],
                       ['i', 'am', 'from', 'lanzhou', 'now', 'i', 'study', 'in', 'lanzhou', 'university', 'of', 'science', 'and', 'technolgy']]
@@ -81,22 +81,22 @@ class TestDM25(unittest.TestCase):
                          [ 0.51082562],
                          [ 0.51082562],
                          [ 0.51082562]])
-        b25=bm25.bm_25 (tf_matrix_classique,docs_content,docuCount,i2)
-        np.testing.assert_allclose( b25, np.array([[-2.67562645, -1.86130536, -1.58555642],
-                                                   [ 0.70238523,  0.        ,  0.        ],
-                                                   [-0.70238523, -0.31217121, -0.        ],
-                                                   [-0.        , -0.31217121, -0.25541281],
-                                                   [-0.        , -0.31217121, -0.25541281],
-                                                   [-0.        , -0.31217121, -0.25541281],
-                                                   [-0.        , -0.31217121, -0.25541281],
-                                                   [-0.        , -0.31217121, -0.25541281],
-                                                   [ 0.        ,  0.31217121,  0.        ],
-                                                   [-0.        , -0.31217121, -0.25541281],
-                                                   [ 0.        ,  0.        ,  0.41622829],
-                                                   [ 0.        ,  0.        ,  0.25541281],
-                                                   [ 0.        ,  0.        ,  0.25541281],
-                                                   [ 0.        ,  0.        ,  0.25541281],
-                                                   [ 0.        ,  0.        ,  0.25541281]]) )
+        b25=bm25.bm_25 (nb_occ,docs_content,docuCount,i2)
+        np.testing.assert_allclose( b25, np.array([[-2.67562645, -2.59454687, -2.31405531],
+                                                   [ 0.70238523,  0.        ,  0.        ],
+                                                   [-0.70238523, -0.48861581, -0.        ],
+                                                   [-0.        , -0.48861581, -0.41622829],
+                                                   [-0.        , -0.48861581, -0.41622829],
+                                                   [-0.        , -0.48861581, -0.41622829],
+                                                   [-0.        , -0.48861581, -0.41622829],
+                                                   [-0.        , -0.48861581, -0.41622829],
+                                                   [ 0.        ,  0.48861581,  0.        ],
+                                                   [-0.        , -0.48861581, -0.41622829],
+                                                   [ 0.        ,  0.        ,  0.60746831],
+                                                   [ 0.        ,  0.        ,  0.41622829],
+                                                   [ 0.        ,  0.        ,  0.41622829],
+                                                   [ 0.        ,  0.        ,  0.41622829],
+                                                   [ 0.        ,  0.        ,  0.41622829]]) )
diff --git a/index_inverse/test/test_recherche.py b/index_inverse/test/test_recherche.py
new file mode 100755
index 0000000000000000000000000000000000000000..e40f9608d5e9b4fde67bd0d4b80d3026f56a255e
--- /dev/null
+++ b/index_inverse/test/test_recherche.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Apr 13 03:08:05 2021
+
+
+"""
+
+import sys
+sys.path.append('../src')
+from IndexInverse import IndexInverse
+from sort_index import sort_index
+from recherche import recherche
+import unittest
+
+class TestRecherche(unittest.TestCase):
+    def test_Recherche_1(self):
+        # search for the word "i"
+        index=IndexInverse()
+        index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy')
+        index.ajout_url('lipn.fr_1', 'I love shanghai')
+        index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university')
+        index.create_index()
+        index,score=sort_index(index)
+        self.assertEqual(recherche("i",index,score),['lipn.fr_1', 'lipn.fr_2', 'lipn.fr_3'] )
+
+    def test_Recherche_2(self):
+        # search for the keywords ["i","am"]
+        index=IndexInverse()
+        index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy')
+        index.ajout_url('lipn.fr_1', 'I love shanghai')
+        index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university')
+        index.create_index()
+        index,score=sort_index(index)
+        self.assertEqual(recherche(["i","am"],index,score),['lipn.fr_1', 'lipn.fr_2', 'lipn.fr_3'] )
+
+    def test_Recherche_3(self):
+        # search for the keywords ["i","am","from"]
+        index=IndexInverse()
+        index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy')
+        index.ajout_url('lipn.fr_1', 'I love shanghai')
+        index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university')
+        index.create_index()
+        index,score=sort_index(index)
+        self.assertEqual(recherche(["i","am","from"],index,score), ['lipn.fr_2', 'lipn.fr_1', 'lipn.fr_3'] )
+
+
+if __name__ == '__main__':
+    unittest.main()
+
diff --git a/index_inverse/test/test_sort_index.py b/index_inverse/test/test_sort_index.py
new file mode 100755
index 0000000000000000000000000000000000000000..501d1546e0b72972729747eea36f893881bdfeb3
--- /dev/null
+++ b/index_inverse/test/test_sort_index.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Apr 13 02:43:25 2021
+
+
+"""
+
+import numpy as np
+import sys
+sys.path.append('../src')
+from IndexInverse import IndexInverse
+from sort_index import sort_index
+from numpy import inf
+import unittest
+
+class SaneEqualityArray(np.ndarray):
+    def __eq__(self, other):
+        return (isinstance(other, SaneEqualityArray) and self.shape == other.shape and np.ndarray.__eq__(self, other).all())
+
+class TestSortIndex(unittest.TestCase):
+    def test_SortIndex_1(self):
+        # Rank using the "tf_classique/idf" score
+        index=IndexInverse()
+        index.ajout_url('lipn.fr_1', 'I love shanghai')
+        index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university')
+        index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy')
+        index.create_index()
+        index,score=sort_index(index)
+        self.assertEqual(index.get_index(), {'i': [0, 1, 2], 'love': [0], 'shanghai': [0, 1], 'am': [1, 2], 'from': [1, 2],
+                                             'now': [1, 2], 'study': [1, 2], 'in': [1, 2], 'tongji': [1], 'university': [1, 2],
+                                             'lanzhou': [2], 'of': [2], 'science': [2], 'and': [2], 'technolgy': [2]} )
+
+        np.testing.assert_allclose(score, np.array([[0.48877902, 0.29326741, 0.20947672],
+                                                    [0.5364793 , 0.        , 0.        ],
+                                                    [0.50135913, 0.15040774, 0.        ],
+                                                    [0.        , 0.15040774, 0.1074341 ],
+                                                    [0.        , 0.15040774, 0.1074341 ],
+                                                    [0.        , 0.15040774, 0.1074341 ],
+                                                    [0.        , 0.15040774, 0.1074341 ],
+                                                    [0.        , 0.15040774, 0.1074341 ],
+                                                    [0.        , 0.16094379, 0.        ],
+                                                    [0.        , 0.15040774, 0.1074341 ],
+                                                    [0.        , 0.        , 0.2299197 ],
+                                                    [0.        , 0.        , 0.11495985],
+                                                    [0.        , 0.        , 0.11495985],
+                                                    [0.        , 0.        , 0.11495985],
+                                                    [0.        , 0.        , 0.11495985]]) )
+
+
+    def test_SortIndex_2(self):
+        # Rank using the "tf_normalisation_log/idf" score
+        index=IndexInverse()
+        index.ajout_url('lipn.fr_1', 'I love shanghai')
+        index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university')
+        index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy')
+        index.create_index()
+        index,score=sort_index(index,"tf_normalisation_log/idf")
+        self.assertEqual(index.get_index(), {'i': [1, 2, 0], 'love': [0], 'shanghai': [0, 1], 'am': [1, 2], 'from': [1, 2],
+                                             'now': [1, 2], 'study': [1, 2], 'in': [1, 2], 'tongji': [1], 'university': [1, 2],
+                                             'lanzhou': [2], 'of': [2], 'science': [2], 'and': [2], 'technolgy': [2]} )
+
+        np.testing.assert_allclose(score, np.array([[1.46633707, 2.48272447, 2.48272447],
+                                                    [1.60943791,       -inf,       -inf],
+                                                    [1.5040774 , 1.5040774 ,       -inf],
+                                                    [      -inf, 1.5040774 , 1.5040774 ],
+                                                    [      -inf, 1.5040774 , 1.5040774 ],
+                                                    [      -inf, 1.5040774 , 1.5040774 ],
+                                                    [      -inf, 1.5040774 , 1.5040774 ],
+                                                    [      -inf, 1.5040774 , 1.5040774 ],
+                                                    [      -inf, 1.60943791,       -inf],
+                                                    [      -inf, 1.5040774 , 1.5040774 ],
+                                                    [      -inf,       -inf, 2.72501526],
+                                                    [      -inf,       -inf, 1.60943791],
+                                                    [      -inf,       -inf, 1.60943791],
+                                                    [      -inf,       -inf, 1.60943791],
+                                                    [      -inf,       -inf, 1.60943791]]))
+
+    def test_SortIndex_3(self):
+        # Rank using the "tf_normalisation_max/idf" score
+        index=IndexInverse()
+        index.ajout_url('lipn.fr_1', 'I love shanghai')
+        index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university')
+        index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy')
+        index.create_index()
+        index,score=sort_index(index,"tf_normalisation_max/idf")
+        self.assertEqual(index.get_index(), {'i': [1, 2, 0], 'love': [0], 'shanghai': [0, 1], 'am': [1, 2], 'from': [1, 2],
+                                             'now': [1, 2], 'study': [1, 2], 'in': [1, 2], 'tongji': [1], 'university': [1, 2],
+                                             'lanzhou': [2], 'of': [2], 'science': [2], 'and': [2], 'technolgy': [2]} )
+
+        np.testing.assert_allclose(score, np.array([[1.0997528 , 1.46633707, 1.46633707],
+                                                    [1.60943791, 0.80471896, 0.80471896],
+                                                    [1.5040774 , 1.5040774 , 0.7520387 ],
+                                                    [0.7520387 , 1.5040774 , 1.5040774 ],
+                                                    [0.7520387 , 1.5040774 , 1.5040774 ],
+                                                    [0.7520387 , 1.5040774 , 1.5040774 ],
+                                                    [0.7520387 , 1.5040774 , 1.5040774 ],
+                                                    [0.7520387 , 1.5040774 , 1.5040774 ],
+                                                    [0.80471896, 1.60943791, 0.80471896],
+                                                    [0.7520387 , 1.5040774 , 1.5040774 ],
+                                                    [0.80471896, 0.80471896, 1.60943791],
+                                                    [0.80471896, 0.80471896, 1.60943791],
+                                                    [0.80471896, 0.80471896, 1.60943791],
+                                                    [0.80471896, 0.80471896, 1.60943791],
+                                                    [0.80471896, 0.80471896, 1.60943791]]))
+
+
+    def test_SortIndex_4(self):
+        # Rank using the "bm_25" score
+        index=IndexInverse()
+        index.ajout_url('lipn.fr_1', 'I love shanghai')
+        index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university')
+        index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy')
+        index.create_index()
+        index,score=sort_index(index,"bm_25")
+        self.assertEqual(index.get_index(), {'i': [2, 1, 0], 'love': [0], 'shanghai': [2, 1], 'am': [0, 2], 'from': [0, 2],
+                                             'now': [0, 2], 'study': [0, 2], 'in': [0, 2], 'tongji': [1], 'university': [0, 2],
+                                             'lanzhou': [2], 'of': [2], 'science': [2], 'and': [2], 'technolgy': [2]} )
+
+        np.testing.assert_allclose(score, np.array([[-2.67562645, -2.59454687, -2.31405531],
+                                                    [ 0.70238523,  0.        ,  0.        ],
+                                                    [-0.70238523, -0.48861581, -0.        ],
+                                                    [-0.        , -0.48861581, -0.41622829],
+                                                    [-0.        , -0.48861581, -0.41622829],
+                                                    [-0.        , -0.48861581, -0.41622829],
+                                                    [-0.        , -0.48861581, -0.41622829],
+                                                    [-0.        , -0.48861581, -0.41622829],
+                                                    [ 0.        ,  0.48861581,  0.        ],
+                                                    [-0.        , -0.48861581, -0.41622829],
+                                                    [ 0.        ,  0.        ,  0.60746831],
+                                                    [ 0.        ,  0.        ,  0.41622829],
+                                                    [ 0.        ,  0.        ,  0.41622829],
+                                                    [ 0.        ,  0.        ,  0.41622829],
+                                                    [ 0.        ,  0.        ,  0.41622829]]))
+
+
+
+if __name__ == '__main__':
+    unittest.main()
+
diff --git a/index_inverse/test/test_tf_idf.py b/index_inverse/test/test_tf_idf.py
index e314fae67cc0230109d0509c1dba035218627c9b..5e2b8e4bce891367f49e4d01e71b9f1e98381d0a 100755
--- a/index_inverse/test/test_tf_idf.py
+++ b/index_inverse/test/test_tf_idf.py
@@ -19,6 +19,7 @@ class SaneEqualityArray(np.ndarray):
 
 class TestTfIdf(unittest.TestCase):
     def test_TfIdf_1(self):
+        # tf_classique
         docs_content=[['i', 'love', 'shanghai'],
                       ['i', 'am', 'from', 'shanghai', 'now', 'i', 'study', 'in', 'tongji', 'university'],
                       ['i', 'am', 'from', 'lanzhou', 'now', 'i', 'study', 'in', 'lanzhou', 'university', 'of', 'science', 'and', 'technolgy']]
@@ -26,90 +27,93 @@ class TestTfIdf(unittest.TestCase):
                     'university': 9, 'lanzhou': 10, 'of': 11, 'science': 12, 'and': 13, 'technolgy': 14}
         docuCount=3
         tf_matrix_classique=tf_idf.tf_classique (docs_content,vocabulary,docuCount)
-        np.testing.assert_array_equal(tf_matrix_classique, np.array([[1. , 1. , 1. ],
-                                                                     [1. , 0. , 0. ],
-                                                                     [1. , 0.5, 0. ],
-                                                                     [0. , 0.5, 0.5],
-                                                                     [0. , 0.5, 0.5],
-                                                                     [0. , 0.5, 0.5],
-                                                                     [0. , 0.5, 0.5],
-                                                                     [0. , 0.5, 0.5],
-                                                                     [0. , 0.5, 0. ],
-                                                                     [0. , 0.5, 0.5],
-                                                                     [0. , 0. , 1. ],
-                                                                     [0. , 0. , 0.5],
-                                                                     [0. , 0. , 0.5],
-                                                                     [0. , 0. , 0.5],
-                                                                     [0. , 0. , 0.5]]))
+        np.testing.assert_allclose(tf_matrix_classique, np.array([[0.33333333, 0.2       , 0.14285714],
+                                                                  [0.33333333, 0.        , 0.        ],
+                                                                  [0.33333333, 0.1       , 0.        ],
+                                                                  [0.        , 0.1       , 0.07142857],
+                                                                  [0.        , 0.1       , 0.07142857],
+                                                                  [0.        , 0.1       , 0.07142857],
+                                                                  [0.        , 0.1       , 0.07142857],
+                                                                  [0.        , 0.1       , 0.07142857],
+                                                                  [0.        , 0.1       , 0.        ],
+                                                                  [0.        , 0.1       , 0.07142857],
+                                                                  [0.        , 0.        , 0.14285714],
+                                                                  [0.        , 0.        , 0.07142857],
+                                                                  [0.        , 0.        , 0.07142857],
+                                                                  [0.        , 0.        , 0.07142857],
+                                                                  [0.        , 0.        , 0.07142857]]) )
 
-    def test_TfIdf_2(self):
-        tf_matrix_classique=np.array([[1. , 1. , 1. ],
-                                      [1. , 0. , 0. ],
-                                      [1. , 0.5, 0. ],
-                                      [0. , 0.5, 0.5],
-                                      [0. , 0.5, 0.5],
-                                      [0. , 0.5, 0.5],
-                                      [0. , 0.5, 0.5],
-                                      [0. , 0.5, 0.5],
-                                      [0. , 0.5, 0. ],
-                                      [0. , 0.5, 0.5],
-                                      [0. , 0. , 1. ],
-                                      [0. , 0. , 0.5],
-                                      [0. , 0. , 0.5],
-                                      [0. , 0. , 0.5],
-                                      [0. , 0. , 0.5]])
-        mat_nor_log=tf_idf.tf_normalisation_log (tf_matrix_classique)
-        np.testing.assert_allclose( mat_nor_log,np.array([[1.        , 1.        , 1.        ],
-                                                          [1.        ,       -inf,       -inf],
-                                                          [1.        , 0.30685282,       -inf],
-                                                          [      -inf, 0.30685282, 0.30685282],
-                                                          [      -inf, 0.30685282, 0.30685282],
-                                                          [      -inf, 0.30685282, 0.30685282],
-                                                          [      -inf, 0.30685282, 0.30685282],
-                                                          [      -inf, 0.30685282, 0.30685282],
-                                                          [      -inf, 0.30685282,       -inf],
-                                                          [      -inf, 0.30685282, 0.30685282],
-                                                          [      -inf,       -inf, 1.        ],
-                                                          [      -inf,       -inf, 0.30685282],
-                                                          [      -inf,       -inf, 0.30685282],
-                                                          [      -inf,       -inf, 0.30685282],
-                                                          [      -inf,       -inf, 0.30685282]]) )
+    def test_TfIdf_2(self):
+        # tf_normalisation_log
+        nb_occ=np.array([[1., 2., 2.],
+                         [1., 0., 0.],
+                         [1., 1., 0.],
+                         [0., 1., 1.],
+                         [0., 1., 1.],
+                         [0., 1., 1.],
+                         [0., 1., 1.],
+                         [0., 1., 1.],
+                         [0., 1., 0.],
+                         [0., 1., 1.],
+                         [0., 0., 2.],
+                         [0., 0., 1.],
+                         [0., 0., 1.],
+                         [0., 0., 1.],
+                         [0., 0., 1.]])
+        mat_nor_log=tf_idf.tf_normalisation_log (nb_occ)
+        np.testing.assert_allclose( mat_nor_log, np.array([[1.        , 1.69314718, 1.69314718],
+                                                           [1.        ,       -inf,       -inf],
+                                                           [1.        , 1.        ,       -inf],
+                                                           [      -inf, 1.        , 1.        ],
+                                                           [      -inf, 1.        , 1.        ],
+                                                           [      -inf, 1.        , 1.        ],
+                                                           [      -inf, 1.        , 1.        ],
+                                                           [      -inf, 1.        , 1.        ],
+                                                           [      -inf, 1.        ,       -inf],
+                                                           [      -inf, 1.        , 1.        ],
+                                                           [      -inf,       -inf, 1.69314718],
+                                                           [      -inf,       -inf, 1.        ],
+                                                           [      -inf,       -inf, 1.        ],
+                                                           [      -inf,       -inf, 1.        ],
+                                                           [      -inf,       -inf, 1.        ]]) )
 
     def test_TfIdf_3(self):
-        tf_matrix_classique=np.array([[1. , 1. , 1. ],
-                                      [1. , 0. , 0. ],
-                                      [1. , 0.5, 0. ],
-                                      [0. , 0.5, 0.5],
-                                      [0. , 0.5, 0.5],
-                                      [0. , 0.5, 0.5],
-                                      [0. , 0.5, 0.5],
-                                      [0. , 0.5, 0.5],
-                                      [0. , 0.5, 0. ],
-                                      [0. , 0.5, 0.5],
-                                      [0. , 0. , 1. ],
-                                      [0. , 0. , 0.5],
-                                      [0. , 0. , 0.5],
-                                      [0. , 0. , 0.5],
-                                      [0. , 0. , 0.5]])
-        mat_nor_max=tf_idf.tf_normalisation_max (tf_matrix_classique)
-        np.testing.assert_array_equal( mat_nor_max,np.array([[1.  , 1.  , 1.  ],
-                                                             [1.  , 0.5 , 0.5 ],
-                                                             [1.  , 0.75, 0.5 ],
-                                                             [0.5 , 1.  , 1.  ],
-                                                             [0.5 , 1.  , 1.  ],
-                                                             [0.5 , 1.  , 1.  ],
-                                                             [0.5 , 1.  , 1.  ],
-                                                             [0.5 , 1.  , 1.  ],
-                                                             [0.5 , 1.  , 0.5 ],
-                                                             [0.5 , 1.  , 1.  ],
-                                                             [0.5 , 0.5 , 1.  ],
-                                                             [0.5 , 0.5 , 1.  ],
-                                                             [0.5 , 0.5 , 1.  ],
-                                                             [0.5 , 0.5 , 1.  ],
-                                                             [0.5 , 0.5 , 1.  ]]) )
+        # tf_normalisation_max
+        nb_occ=np.array([[1., 2., 2.],
+                         [1., 0., 0.],
+                         [1., 1., 0.],
+                         [0., 1., 1.],
+                         [0., 1., 1.],
+                         [0., 1., 1.],
+                         [0., 1., 1.],
+                         [0., 1., 1.],
+                         [0., 1., 0.],
+                         [0., 1., 1.],
+                         [0., 0., 2.],
+                         [0., 0., 1.],
+                         [0., 0., 1.],
+                         [0., 0., 1.],
+                         [0., 0., 1.]])
+        mat_nor_max=tf_idf.tf_normalisation_max (nb_occ)
+        np.testing.assert_array_equal( mat_nor_max, np.array([[0.75, 1.  , 1.  ],
+                                                              [1.  , 0.5 , 0.5 ],
+                                                              [1.  , 1.  , 0.5 ],
+                                                              [0.5 , 1.  , 1.  ],
+                                                              [0.5 , 1.  , 1.  ],
+                                                              [0.5 , 1.  , 1.  ],
+                                                              [0.5 , 1.  , 1.  ],
+                                                              [0.5 , 1.  , 1.  ],
+                                                              [0.5 , 1.  , 0.5 ],
+                                                              [0.5 , 1.  , 1.  ],
+                                                              [0.5 , 0.5 , 1.  ],
+                                                              [0.5 , 0.5 , 1.  ],
+                                                              [0.5 , 0.5 , 1.  ],
+                                                              [0.5 , 0.5 , 1.  ],
+                                                              [0.5 , 0.5 , 1.  ]]))
 
     def test_TfIdf_4(self):
+        # idf
         invert_index={'i': [0, 1, 2], 'love': [0], 'shanghai': [0, 1], 'am': [1, 2], 'from': [1, 2], 'now': [1, 2], 'study': [1, 2], 'in': [1, 2], 'tongji': [1], 'university': [1, 2],
                       'lanzhou': [2], 'of': [2], 'science': [2], 'and': [2], 'technolgy': [2]}
         vocabulary={'i': 0, 'love': 1, 'shanghai': 2, 'am': 3, 'from': 4, 'now': 5, 'study': 6, 'in': 7, 'tongji': 8,
                     'university': 9, 'lanzhou': 10, 'of': 11, 'science': 12, 'and': 13, 'technolgy': 14}
@@ -117,20 +121,20 @@ class TestTfIdf(unittest.TestCase):
         docuCount=3
         mat_idf=tf_idf.idf(invert_index,vocabulary,vocaCount,docuCount)
         np.testing.assert_allclose(mat_idf,np.array([[1.46633707],
-                                                     [1.60943791],
-                                                     [1.5040774 ],
-                                                     [1.5040774 ],
-                                                     [1.5040774 ],
-                                                     [1.5040774 ],
-                                                     [1.5040774 ],
-                                                     [1.5040774 ],
-                                                     [1.60943791],
-                                                     [1.5040774 ],
-                                                     [1.60943791],
-                                                     [1.60943791],
-                                                     [1.60943791],
-                                                     [1.60943791],
-                                                     [1.60943791]]) )
+                                                     [1.60943791],
+                                                     [1.5040774 ],
+                                                     [1.5040774 ],
+                                                     [1.5040774 ],
+                                                     [1.5040774 ],
+                                                     [1.5040774 ],
+                                                     [1.5040774 ],
+                                                     [1.60943791],
+                                                     [1.5040774 ],
+                                                     [1.60943791],
+                                                     [1.60943791],
+                                                     [1.60943791],
+                                                     [1.60943791],
+                                                     [1.60943791]]) )
 
 
 if __name__ == '__main__':
diff --git a/pagerank/.DS_Store b/pagerank/.DS_Store
deleted file mode 100644
index 95b5f197a236ab1803b61635259f1c29c270a943..0000000000000000000000000000000000000000
Binary files a/pagerank/.DS_Store and /dev/null differ