From a47318eee8fcffcc4c506d309f18cd3322352c38 Mon Sep 17 00:00:00 2001 From: LILI JIN Date: Mon, 12 Apr 2021 19:30:55 +0200 Subject: [PATCH 1/9] index --- index_inverse/src/bm25.py | 4 ++-- index_inverse/test/test_bm25.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/index_inverse/src/bm25.py b/index_inverse/src/bm25.py index e5371e5..c2f0c5b 100755 --- a/index_inverse/src/bm25.py +++ b/index_inverse/src/bm25.py @@ -3,9 +3,9 @@ """ Created on Tue Mar 30 19:51:28 2021 -@author: jinlili -dm-25 + +bm-25 """ import numpy as np diff --git a/index_inverse/test/test_bm25.py b/index_inverse/test/test_bm25.py index ab21bb2..477c81d 100755 --- a/index_inverse/test/test_bm25.py +++ b/index_inverse/test/test_bm25.py @@ -3,7 +3,7 @@ """ Created on Wed Mar 31 00:08:41 2021 -@author: jinlili + """ -- GitLab From 6e9a146e24839b73b69349b6b6c722ad9ad7a604 Mon Sep 17 00:00:00 2001 From: LILI JIN Date: Tue, 13 Apr 2021 03:33:04 +0200 Subject: [PATCH 2/9] sorte, charger et recherche --- .DS_Store | Bin 8196 -> 8196 bytes .gitlab-ci.yml | 4 +- index_inverse/.DS_Store | Bin 6148 -> 6148 bytes index_inverse/src/.DS_Store | Bin 6148 -> 6148 bytes index_inverse/src/IndexInverse.py | 35 ++-- .../src/__pycache__/bm25.cpython-37.pyc | Bin 1469 -> 0 bytes index_inverse/src/bm25.py | 10 +- index_inverse/src/tf_idf.py | 30 ++- index_inverse/test/test_IndexInverse.py | 92 ++++----- index_inverse/test/test_bm25.py | 62 +++--- index_inverse/test/test_tf_idf.py | 188 +++++++++--------- 11 files changed, 219 insertions(+), 202 deletions(-) delete mode 100644 index_inverse/src/__pycache__/bm25.cpython-37.pyc diff --git a/.DS_Store b/.DS_Store index aae1ba7d953b3f00bd24640d56934df4e4f845c3..463be7cef12b4ecac4903e25f6c5c87eaaf988f8 100644 GIT binary patch delta 260 zcmZp1XmQw}DiGHtdz^uRfrUYjA)O(Up(Hoo#U&{xKM5$tu};OircmgxBdUA~UipFy z!{Frn+ybB;28J~Po0|o0FfyK;93#}gCQ)5&Xl^$7ric>b%*lU50Kd+DTr)&S=06_v6F8JD^5Ns%)<)f=`$^004d$9DPqpZ!BpP9Jr?M|$qu4olj}se aSiu6DdqpdmHnU6oW|_<=8jo9184~~m(@)I+ delta 256 zcmZp1XmQw}DiGJX+?9cWfrUYjA)O(Up(Hoo#U&{xKM5$taq9gpZaJaDj;Qh}c;yQ+ z41<&Na|?ia7#Nma*xW2|gOTyXLj!}!_eEqGXH5PpBF{K;@;za> z$R diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4df0d21..0b1ba83 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -15,9 +15,11 @@ lipn-search: - echo "Début de la phase de test" - echo "Début des tests sur l'index inversé" - cd index_inverse/test/ + - pytest test_IndexInverse.py - pytest test_tf_idf.py - pytest test_bm25.py - - pytest test_IndexInverse.py + - pytest test_sort_index.py + - pytest test_recherche.py - cd ../../ - echo "Fin des tests sur l'index inversé" - echo "Début des tests sur le pagerank" diff --git a/index_inverse/.DS_Store b/index_inverse/.DS_Store index 11d75e79c8134e2c94e8c09947f1d344fad7483b..04e9c4ea12b428f6728a75f9e2717c93d5cf758f 100644 GIT binary patch delta 195 zcmZoMXfc?O$b2%XV6z~{a^}h0EMkl&CwH(ma7$EI8ycACC>R+VOR(&D*OC(E-+vO-w4 ztkR4VCP%XBAz5Q?hGq@ptjPgvQcQhvlXKW4Sf}oYyE$cYCz~|uxh401OrE@wO_uS& P=HqO3jGNgx{_+C=2n0Ip delta 214 zcmZoMXfc?O$b2HHV6z~{a^}h0EMkl&CU>$naEVt}TbS!87#bK%UdtlOIAii@7J0Lo z3=9m+48;sZ49Pj^hQZ1CxdlKGATV|X5>T1kd>5Cboctu92uDIf*&W%*3apZ>5SAUQ zG-JZ#C{{f@w#b5PVPPm?NCnzdf@)9v&B=Le60DmuLB@8oNkdty*<=|fY(B|m$GDl5 H<3B$De%w3& diff --git a/index_inverse/src/.DS_Store b/index_inverse/src/.DS_Store index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..264c3e2eae6ad24cf666c60b74bb6968d69c2b68 100644 GIT binary patch literal 6148 zcmeHKzi-n(6n>Yct*xqRgpkU3OScS373hGF+zxLl1%lHdow51!(U!aJK@;;S~4oa{q#8GRmV=MacLbc+`R638t=vC621hxB5RaS&a)*I^N 
zB<>;Q^29kocN?u)T~7D*8g==o^>8+Go$a0Gljp~S z)A!S}*@w?YVu8O#ZCe&E;0p#zR-2wQ{jM71hPrdGoHDEPxrp<-^7mR z?5{_Ue}BA@o%gTK!lTJ7neRJ)n58N^LB_>uiOt3eumY^WZ4|KQhO==SGILk~R^Y!< zfX)YrPUu-I4eFx<3;hH@tYWn`jOi^wInttMu{4M+Xu_l-np9z*7{a7uy|g^fVrkH% zgRqYeVMiABg(CFm*uT_t5S~FUSpinyx&kZavqty-v*qvq>q-2@3a|qIlmepK@jGo? zlD%6O7DsojL_bC+BVK86o`QkCijhlK@dI>iST89+^emPJu?CI*2xu9&UAjHu~2NHo+1YW5HK<@2yDK{Y{s(r0dp1eW_AvK4xj>{$am(+{342+ UKzW7)kiy9(Jj$D6L{=~Z06bF>>>>>> master def get_index(self): '''Obtenir le dictionnaire invert_index. @@ -73,12 +66,12 @@ class IndexInverse : def get_nb_documents(self): '''Renvoyer le nombre de documents dans ce classe ''' - return IndexInverse.docuCount + return self.docuCount def get_nb_vocabularys(self): '''Renvoyer le nombre de mots dans le dictionnaire vocabulary ''' - return IndexInverse.vocaCount + return self.vocaCount def get_vocabulary(self): '''Renvoyer le dictionnaire qui contient des mots @@ -98,7 +91,7 @@ class IndexInverse : def show_index(self): '''Imprimer le dictionnaire invert_index. ''' - print ('NbDocument='+str(IndexInverse.docuCount)) + print ('NbDocument='+str(self.docuCount)) for word, IDs in self.invert_index.items(): print('') print(word+':',end=' ') diff --git a/index_inverse/src/__pycache__/bm25.cpython-37.pyc b/index_inverse/src/__pycache__/bm25.cpython-37.pyc deleted file mode 100644 index 1276f9fc41f3a22ee1c084d42426e37b62322aff..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1469 zcmZ`(OK%%D5GJ_~NgLZO616evpdiqTwsvJpf!fAM;xsl=1ZaF|fdWc`&1y+0Tkk{W zat+I%PVvFVq8DH7V}D2f1y6;0NOSJ3rw&)Pk^n6Q4!<389-MEehl`6Ig7NE;eDT6T z=uh);H7o!RU>2W)V5pB6W+cIV@;zb}vyTz8Uy;7W$dAZ(j_!Ngf`^hbnrHNO?DH9m;XQ6EDN z#N=>{AZndC%ss|^o7EtGL)(M2+l%Z=;|8p7AG8g#U?3zq!wRg2<)SBj;!b zJ^%6kb@!}{;*~@VEn@HJ4t{||g ztGcN|*@sY%vjZ*)PIvOCOnD~hBfs3 zsa7}Iy>)tp=)3duEe7|y!pIR!7=LW}t|r-7J4amPg?3`e(?VM#F13~L3`9PH>MX=2 zDH+5WTi$fIyjCZU+Hq_Q0pJ?kKOmw3%3k-B(HVnGIBXI+w z|FhI?uNNX(H??n%C*nFNKZ02-fk5^WzK%Uy$4&ehIr6GQmHC_7)%bhsjX8$UUt5to zL+Bc;b(j@#V*rM@ktwlzcuQd*tU0ZLjwn}JhM;pYb@xB6r~#^GR8%wpUP03a&}E3i zH&GMKkaCuha+IqYE1)xaLt+z6J>>z3q`U(K28!l9?RDtI$q9YhX?NFXx4rS4ws=Oz zB9t-k`i`FgP`T0mruRSFiYdwqNs|x=&qgIL1!W~QMK^6S4;LPcSVA}HcfeRUdasXu z;MqvNT>HR5nooF^amsix_XLQWLs6b2Jd$yq&3!+70*_6uF?w&-d>1FDEjl!c*QYJG z=W54?o3GDQs}wY^Pic#Gpfv}p1B2lf>s5w6h2Pxq90>DleOH*5sGU%VaH8FCJmwh_ zhFEPCWvcBWKH~m0eM1g|G?XGf45B0~3R6(M03|7cD9o|K@6Mv%H-BrDFPqfKzo%+y`~Uy| diff --git a/index_inverse/src/bm25.py b/index_inverse/src/bm25.py index c2f0c5b..241639e 100755 --- a/index_inverse/src/bm25.py +++ b/index_inverse/src/bm25.py @@ -25,7 +25,7 @@ class bm25: # print((docuCount-df+0.5) / (df+0.5)) return idf_2 - def bm_25 (tf_matrix_classique,docs_content,docuCount,idf_2,k=1.2 , b=0.75): + def bm_25 (nb_occ,docs_content,docuCount,idf_2,k=1.2 , b=0.75): ''' k1 ∈ [1.2, 2.0] (en pratique) b = 0.75 (en pratique) @@ -39,7 +39,13 @@ class bm25: d_longueur=np.append(d_longueur,len(d)) sum_d=np.sum(d_longueur) avg_d=sum_d/np.size(d_longueur) - bm_25=idf_2*(tf_matrix_classique*(k+1)/(tf_matrix_classique+k*(1-b+b*(d_longueur/avg_d)))) + bm_25=idf_2*(nb_occ*(k+1)/(nb_occ+k*(1-b+b*(d_longueur/avg_d)))) return bm_25 + + + + + + diff --git a/index_inverse/src/tf_idf.py b/index_inverse/src/tf_idf.py index d39a86f..d47da85 100755 --- a/index_inverse/src/tf_idf.py +++ b/index_inverse/src/tf_idf.py @@ -11,30 +11,41 @@ import numpy as np from collections import Counter class tf_idf: - + # matrice_tf: [n_vocab, n_doc] + # matrice_idf: [n_vocab, 1] + + def nb_occ (docs_content,vocabulary,docuCount): + ''' nombre d’occurrences de 
t dans d ''' + tf_matrix = np.zeros((len(vocabulary),docuCount), dtype=np.float64) # [n_vocab, n_doc] + for id, doc in zip(range(docuCount),docs_content): + counter = Counter(doc) + for v in counter.keys(): + tf_matrix[vocabulary[v],id] = counter[v] + return tf_matrix + def tf_classique (docs_content,vocabulary,docuCount): ''' nombre d’occurrences de t dans d /nombre de mots dans d''' tf_matrix = np.zeros((len(vocabulary),docuCount), dtype=np.float64) # [n_vocab, n_doc] for id, doc in zip(range(docuCount),docs_content): counter = Counter(doc) for v in counter.keys(): - tf_matrix[vocabulary[v],id] = counter[v] / counter.most_common(1)[0][1] + tf_matrix[vocabulary[v],id] = counter[v] / len(doc) return tf_matrix - - def tf_normalisation_log (tf_matrix_classique): + + def tf_normalisation_log (nb_occ): '''Normalisation Logarithmique: 1 + log ft ,d ''' - tf_matrix_log=np.log(tf_matrix_classique)+1 + tf_matrix_log=np.log(nb_occ)+1 return tf_matrix_log - def tf_normalisation_max (tf_matrix_classique): + def tf_normalisation_max (nb_occ): ''' Normalisationpar le max : 0.5 + 0.5 × ft,d/maxt′∈d ft′,d ''' - tf_matrix_max= 0.5 + 0.5 * (tf_matrix_classique / np.max(tf_matrix_classique, axis=1, keepdims=True)) + tf_matrix_max= 0.5 + 0.5 * (nb_occ / np.max(nb_occ, axis=1, keepdims=True)) return tf_matrix_max def idf (invert_index,vocabulary,vocaCount,docuCount): '''Inverse Document Frequency (IDF) - Soit un terme t et une collection de documents D, DFt =log(D/{d ∈ D | t apparaˆıt dans d}) - ''' + Soit un terme t et une collection de documents D, DFt =log(D/{d ∈ D | t apparaˆıt dans d}) + ''' df = np.zeros((vocaCount, 1)) for (key,val) in vocabulary.items(): index_val=invert_index.get(key) @@ -47,3 +58,4 @@ class tf_idf: + diff --git a/index_inverse/test/test_IndexInverse.py b/index_inverse/test/test_IndexInverse.py index 584afb8..db9bf8f 100644 --- a/index_inverse/test/test_IndexInverse.py +++ b/index_inverse/test/test_IndexInverse.py @@ -12,68 +12,68 @@ import unittest class TestIndexInverse(unittest.TestCase): def test_IndexInverse_1(self): - index=IndexInverse() - index.ajout_url('lipn.fr_1', 'I love shanghai') - index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') - index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') - index.create_index() - result = index.get_docs_with_keyword('i') - self.assertEqual(result, [0,1,2]) - + index1=IndexInverse() + index1.ajout_url('lipn.fr_1', 'I love shanghai') + index1.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') + index1.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') + index1.create_index() + result = index1.get_nb_documents() + self.assertEqual(result, 3) + def test_IndexInverse_2(self): - index=IndexInverse() - index.ajout_url('lipn.fr_1', 'I love shanghai') - index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') - index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') - index.create_index() - result = index.get_docs_with_keyword('like') + index2=IndexInverse() + index2.ajout_url('lipn.fr_1', 'I love shanghai') + index2.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') + index2.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') + index2.create_index() + result = index2.get_docs_with_keyword('like') self.assertEqual(result, -1) 
def test_IndexInverse_3(self): - index=IndexInverse() - index.ajout_url('lipn.fr_1', 'I love shanghai') - index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') - index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') - index.create_index() - result = index.get_nb_documents() - self.assertEqual(result, 3) + index3=IndexInverse() + index3.ajout_url('lipn.fr_1', 'I love shanghai') + index3.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') + index3.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') + index3.create_index() + result = index3.get_docs_with_keyword('i') + self.assertEqual(result, [0,1,2]) def test_IndexInverse_4(self): - index=IndexInverse() - index.ajout_url('lipn.fr_1', 'I love shanghai') - index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') - index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') - index.create_index() - result = index.get_nb_vocabularys() + index4=IndexInverse() + index4.ajout_url('lipn.fr_1', 'I love shanghai') + index4.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') + index4.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') + index4.create_index() + result = index4.get_nb_vocabularys() self.assertEqual(result, 15) def test_IndexInverse_5(self): - index=IndexInverse() - index.ajout_url('lipn.fr_1', 'I love shanghai') - index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') - index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') - index.create_index() - result = index.get_vocabulary() + index5=IndexInverse() + index5.ajout_url('lipn.fr_1', 'I love shanghai') + index5.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') + index5.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') + index5.create_index() + result = index5.get_vocabulary() self.assertEqual(result, {'i': 0, 'love': 1, 'shanghai': 2, 'am': 3, 'from': 4, 'now': 5, 'study': 6, 'in': 7, 'tongji': 8, 'university': 9, 'lanzhou': 10, 'of': 11, 'science': 12, 'and': 13, 'technolgy': 14}) def test_IndexInverse_6(self): - index=IndexInverse() - index.ajout_url('lipn.fr_1', 'I love shanghai') - index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') - index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') - index.create_index() - result = index.get_document_id() + index6=IndexInverse() + index6.ajout_url('lipn.fr_1', 'I love shanghai') + index6.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') + index6.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') + index6.create_index() + result = index6.get_document_id() self.assertEqual(result, {0: 'lipn.fr_1', 1: 'lipn.fr_2', 2: 'lipn.fr_3'}) def test_IndexInverse_7(self): - index=IndexInverse() - index.ajout_url('lipn.fr_1', 'I love shanghai') - index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') - index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') - index.create_index() - result = index.get_docs_content() - self.assertEqual(result, 
{0: ['i', 'love', 'shanghai'], 1: ['i','am','from', 'shanghai', 'now', 'i', 'study', 'in', 'tongji', 'university'],2: ['i', 'am', 'from', 'lanzhou', 'now', 'i', 'study', 'in', 'lanzhou', 'university', 'of', 'science', 'and', 'technolgy']}) + index7=IndexInverse() + index7.ajout_url('lipn.fr_1', 'I love shanghai') + index7.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') + index7.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') + index7.create_index() + result = index7.get_docs_content() + self.assertEqual(result, [['i', 'love', 'shanghai'], ['i','am','from', 'shanghai', 'now', 'i', 'study', 'in', 'tongji', 'university'],['i', 'am', 'from', 'lanzhou', 'now', 'i', 'study', 'in', 'lanzhou', 'university', 'of', 'science', 'and', 'technolgy']]) diff --git a/index_inverse/test/test_bm25.py b/index_inverse/test/test_bm25.py index 477c81d..a093831 100755 --- a/index_inverse/test/test_bm25.py +++ b/index_inverse/test/test_bm25.py @@ -47,21 +47,21 @@ class TestDM25(unittest.TestCase): [ 0.51082562]]) ) def test_BM5_2(self): - tf_matrix_classique=np.array([[1. , 1. , 1. ], - [1. , 0. , 0. ], - [1. , 0.5, 0. ], - [0. , 0.5, 0.5], - [0. , 0.5, 0.5], - [0. , 0.5, 0.5], - [0. , 0.5, 0.5], - [0. , 0.5, 0.5], - [0. , 0.5, 0. ], - [0. , 0.5, 0.5], - [0. , 0. , 1. ], - [0. , 0. , 0.5], - [0. , 0. , 0.5], - [0. , 0. , 0.5], - [0. , 0. , 0.5]]) + nb_occ=np.array([[1., 2., 2.], + [1., 0., 0.], + [1., 1., 0.], + [0., 1., 1.], + [0., 1., 1.], + [0., 1., 1.], + [0., 1., 1.], + [0., 1., 1.], + [0., 1., 0.], + [0., 1., 1.], + [0., 0., 2.], + [0., 0., 1.], + [0., 0., 1.], + [0., 0., 1.], + [0., 0., 1.]]) docs_content=[['i', 'love', 'shanghai'], ['i', 'am', 'from', 'shanghai', 'now', 'i', 'study', 'in', 'tongji', 'university'], ['i', 'am', 'from', 'lanzhou', 'now', 'i', 'study', 'in', 'lanzhou', 'university', 'of', 'science', 'and', 'technolgy']] @@ -81,22 +81,22 @@ class TestDM25(unittest.TestCase): [ 0.51082562], [ 0.51082562], [ 0.51082562]]) - b25=bm25.bm_25 (tf_matrix_classique,docs_content,docuCount,i2) - np.testing.assert_allclose( b25, np.array([[-2.67562645, -1.86130536, -1.58555642], - [ 0.70238523, 0. , 0. ], - [-0.70238523, -0.31217121, -0. ], - [-0. , -0.31217121, -0.25541281], - [-0. , -0.31217121, -0.25541281], - [-0. , -0.31217121, -0.25541281], - [-0. , -0.31217121, -0.25541281], - [-0. , -0.31217121, -0.25541281], - [ 0. , 0.31217121, 0. ], - [-0. , -0.31217121, -0.25541281], - [ 0. , 0. , 0.41622829], - [ 0. , 0. , 0.25541281], - [ 0. , 0. , 0.25541281], - [ 0. , 0. , 0.25541281], - [ 0. , 0. , 0.25541281]]) ) + b25=bm25.bm_25 (nb_occ,docs_content,docuCount,i2) + np.testing.assert_allclose( b25, np.array([[-2.67562645, -2.59454687, -2.31405531], + [ 0.70238523, 0. , 0. ], + [-0.70238523, -0.48861581, -0. ], + [-0. , -0.48861581, -0.41622829], + [-0. , -0.48861581, -0.41622829], + [-0. , -0.48861581, -0.41622829], + [-0. , -0.48861581, -0.41622829], + [-0. , -0.48861581, -0.41622829], + [ 0. , 0.48861581, 0. ], + [-0. , -0.48861581, -0.41622829], + [ 0. , 0. , 0.60746831], + [ 0. , 0. , 0.41622829], + [ 0. , 0. , 0.41622829], + [ 0. , 0. , 0.41622829], + [ 0. , 0. 
, 0.41622829]]) ) diff --git a/index_inverse/test/test_tf_idf.py b/index_inverse/test/test_tf_idf.py index e314fae..5e2b8e4 100755 --- a/index_inverse/test/test_tf_idf.py +++ b/index_inverse/test/test_tf_idf.py @@ -19,6 +19,7 @@ class SaneEqualityArray(np.ndarray): class TestTfIdf(unittest.TestCase): def test_TfIdf_1(self): + # tf_classique docs_content=[['i', 'love', 'shanghai'], ['i', 'am', 'from', 'shanghai', 'now', 'i', 'study', 'in', 'tongji', 'university'], ['i', 'am', 'from', 'lanzhou', 'now', 'i', 'study', 'in', 'lanzhou', 'university', 'of', 'science', 'and', 'technolgy']] @@ -26,90 +27,93 @@ class TestTfIdf(unittest.TestCase): 'university': 9, 'lanzhou': 10, 'of': 11, 'science': 12, 'and': 13, 'technolgy': 14} docuCount=3 tf_matrix_classique=tf_idf.tf_classique (docs_content,vocabulary,docuCount) - np.testing.assert_array_equal(tf_matrix_classique, np.array([[1. , 1. , 1. ], - [1. , 0. , 0. ], - [1. , 0.5, 0. ], - [0. , 0.5, 0.5], - [0. , 0.5, 0.5], - [0. , 0.5, 0.5], - [0. , 0.5, 0.5], - [0. , 0.5, 0.5], - [0. , 0.5, 0. ], - [0. , 0.5, 0.5], - [0. , 0. , 1. ], - [0. , 0. , 0.5], - [0. , 0. , 0.5], - [0. , 0. , 0.5], - [0. , 0. , 0.5]])) + np.testing.assert_allclose(tf_matrix_classique, np.array([[0.33333333, 0.2 , 0.14285714], + [0.33333333, 0. , 0. ], + [0.33333333, 0.1 , 0. ], + [0. , 0.1 , 0.07142857], + [0. , 0.1 , 0.07142857], + [0. , 0.1 , 0.07142857], + [0. , 0.1 , 0.07142857], + [0. , 0.1 , 0.07142857], + [0. , 0.1 , 0. ], + [0. , 0.1 , 0.07142857], + [0. , 0. , 0.14285714], + [0. , 0. , 0.07142857], + [0. , 0. , 0.07142857], + [0. , 0. , 0.07142857], + [0. , 0. , 0.07142857]]) ) - def test_TfIdf_2(self): - tf_matrix_classique=np.array([[1. , 1. , 1. ], - [1. , 0. , 0. ], - [1. , 0.5, 0. ], - [0. , 0.5, 0.5], - [0. , 0.5, 0.5], - [0. , 0.5, 0.5], - [0. , 0.5, 0.5], - [0. , 0.5, 0.5], - [0. , 0.5, 0. ], - [0. , 0.5, 0.5], - [0. , 0. , 1. ], - [0. , 0. , 0.5], - [0. , 0. , 0.5], - [0. , 0. , 0.5], - [0. , 0. , 0.5]]) - mat_nor_log=tf_idf.tf_normalisation_log (tf_matrix_classique) - np.testing.assert_allclose( mat_nor_log,np.array([[1. , 1. , 1. ], - [1. , -inf, -inf], - [1. , 0.30685282, -inf], - [ -inf, 0.30685282, 0.30685282], - [ -inf, 0.30685282, 0.30685282], - [ -inf, 0.30685282, 0.30685282], - [ -inf, 0.30685282, 0.30685282], - [ -inf, 0.30685282, 0.30685282], - [ -inf, 0.30685282, -inf], - [ -inf, 0.30685282, 0.30685282], - [ -inf, -inf, 1. ], - [ -inf, -inf, 0.30685282], - [ -inf, -inf, 0.30685282], - [ -inf, -inf, 0.30685282], - [ -inf, -inf, 0.30685282]]) ) + def test_TfIdf_2(self): + # tf_normalisation_log + nb_occ=np.array([[1., 2., 2.], + [1., 0., 0.], + [1., 1., 0.], + [0., 1., 1.], + [0., 1., 1.], + [0., 1., 1.], + [0., 1., 1.], + [0., 1., 1.], + [0., 1., 0.], + [0., 1., 1.], + [0., 0., 2.], + [0., 0., 1.], + [0., 0., 1.], + [0., 0., 1.], + [0., 0., 1.]]) + mat_nor_log=tf_idf.tf_normalisation_log (nb_occ) + np.testing.assert_allclose( mat_nor_log, np.array([[1. , 1.69314718, 1.69314718], + [1. , -inf, -inf], + [1. , 1. , -inf], + [ -inf, 1. , 1. ], + [ -inf, 1. , 1. ], + [ -inf, 1. , 1. ], + [ -inf, 1. , 1. ], + [ -inf, 1. , 1. ], + [ -inf, 1. , -inf], + [ -inf, 1. , 1. ], + [ -inf, -inf, 1.69314718], + [ -inf, -inf, 1. ], + [ -inf, -inf, 1. ], + [ -inf, -inf, 1. ], + [ -inf, -inf, 1. ]]) ) def test_TfIdf_3(self): - tf_matrix_classique=np.array([[1. , 1. , 1. ], - [1. , 0. , 0. ], - [1. , 0.5, 0. ], - [0. , 0.5, 0.5], - [0. , 0.5, 0.5], - [0. , 0.5, 0.5], - [0. , 0.5, 0.5], - [0. , 0.5, 0.5], - [0. , 0.5, 0. ], - [0. , 0.5, 0.5], - [0. , 0. , 1. ], - [0. 
, 0. , 0.5], - [0. , 0. , 0.5], - [0. , 0. , 0.5], - [0. , 0. , 0.5]]) - mat_nor_max=tf_idf.tf_normalisation_max (tf_matrix_classique) - np.testing.assert_array_equal( mat_nor_max,np.array([[1. , 1. , 1. ], - [1. , 0.5 , 0.5 ], - [1. , 0.75, 0.5 ], - [0.5 , 1. , 1. ], - [0.5 , 1. , 1. ], - [0.5 , 1. , 1. ], - [0.5 , 1. , 1. ], - [0.5 , 1. , 1. ], - [0.5 , 1. , 0.5 ], - [0.5 , 1. , 1. ], - [0.5 , 0.5 , 1. ], - [0.5 , 0.5 , 1. ], - [0.5 , 0.5 , 1. ], - [0.5 , 0.5 , 1. ], - [0.5 , 0.5 , 1. ]]) ) + # tf_normalisation_max + nb_occ=np.array([[1., 2., 2.], + [1., 0., 0.], + [1., 1., 0.], + [0., 1., 1.], + [0., 1., 1.], + [0., 1., 1.], + [0., 1., 1.], + [0., 1., 1.], + [0., 1., 0.], + [0., 1., 1.], + [0., 0., 2.], + [0., 0., 1.], + [0., 0., 1.], + [0., 0., 1.], + [0., 0., 1.]]) + mat_nor_max=tf_idf.tf_normalisation_max (nb_occ) + np.testing.assert_array_equal( mat_nor_max, np.array([[0.75, 1. , 1. ], + [1. , 0.5 , 0.5 ], + [1. , 1. , 0.5 ], + [0.5 , 1. , 1. ], + [0.5 , 1. , 1. ], + [0.5 , 1. , 1. ], + [0.5 , 1. , 1. ], + [0.5 , 1. , 1. ], + [0.5 , 1. , 0.5 ], + [0.5 , 1. , 1. ], + [0.5 , 0.5 , 1. ], + [0.5 , 0.5 , 1. ], + [0.5 , 0.5 , 1. ], + [0.5 , 0.5 , 1. ], + [0.5 , 0.5 , 1. ]])) def test_TfIdf_4(self): + # idf invert_index={'i': [0, 1, 2], 'love': [0], 'shanghai': [0, 1], 'am': [1, 2], 'from': [1, 2], 'now': [1, 2], 'study': [1, 2], 'in': [1, 2], 'tongji': [1], 'university': [1, 2], 'lanzhou': [2], 'of': [2], 'science': [2], 'and': [2], 'technolgy': [2]} vocabulary={'i': 0, 'love': 1, 'shanghai': 2, 'am': 3, 'from': 4, 'now': 5, 'study': 6, 'in': 7, 'tongji': 8, 'university': 9, 'lanzhou': 10, 'of': 11, 'science': 12, 'and': 13, 'technolgy': 14} @@ -117,20 +121,20 @@ class TestTfIdf(unittest.TestCase): docuCount=3 mat_idf=tf_idf.idf(invert_index,vocabulary,vocaCount,docuCount) np.testing.assert_allclose(mat_idf,np.array([[1.46633707], - [1.60943791], - [1.5040774 ], - [1.5040774 ], - [1.5040774 ], - [1.5040774 ], - [1.5040774 ], - [1.5040774 ], - [1.60943791], - [1.5040774 ], - [1.60943791], - [1.60943791], - [1.60943791], - [1.60943791], - [1.60943791]]) ) + [1.60943791], + [1.5040774 ], + [1.5040774 ], + [1.5040774 ], + [1.5040774 ], + [1.5040774 ], + [1.5040774 ], + [1.60943791], + [1.5040774 ], + [1.60943791], + [1.60943791], + [1.60943791], + [1.60943791], + [1.60943791]]) ) if __name__ == '__main__': -- GitLab From 88fdaff10916eecd291b8d36a22d95f3b25e8041 Mon Sep 17 00:00:00 2001 From: LILI JIN Date: Tue, 13 Apr 2021 03:44:26 +0200 Subject: [PATCH 3/9] sorte, charger et recherche --- .DS_Store | Bin 8196 -> 8196 bytes index_inverse/.DS_Store | Bin 6148 -> 6148 bytes index_inverse/src/charger_doc.py | 47 +++++++++ index_inverse/src/recherche.py | 60 +++++++++++ index_inverse/src/sort_index.py | 56 +++++++++++ index_inverse/test/test_recherche.py | 50 +++++++++ index_inverse/test/test_sort_index.py | 140 ++++++++++++++++++++++++++ 7 files changed, 353 insertions(+) create mode 100755 index_inverse/src/charger_doc.py create mode 100755 index_inverse/src/recherche.py create mode 100755 index_inverse/src/sort_index.py create mode 100755 index_inverse/test/test_recherche.py create mode 100755 index_inverse/test/test_sort_index.py diff --git a/.DS_Store b/.DS_Store index 463be7cef12b4ecac4903e25f6c5c87eaaf988f8..b20781aacfa0e953d9d8c452ee4cdecf4977045a 100644 GIT binary patch delta 135 zcmZp1XmQw}CJ@`w#lXP8!l1{H&XCDalAG`1l9ZF51Qg@QicS-oE_B!tRXzo;d_jg` yaB_Zb0Zpq@97g8mItoUH#*^=h$WCSvE@ArcZ1O!}xy>_#b9eya(<5d8 delta 130 zcmZp1XmQw}CJ-xooPmLXg+Y%YogtH|x!t}0i@;za>%`=2^cmPe7A}#;` diff --git 
a/index_inverse/.DS_Store b/index_inverse/.DS_Store index 04e9c4ea12b428f6728a75f9e2717c93d5cf758f..57e7f8ff1a4a2aa8558dec8ead74f7b0ef811fa0 100644 GIT binary patch delta 108 zcmZoMXffEZl!e2{++0V&$k2H5T9zx^Tv6 tlNDGcSs^St)^o@j9VdsfNigzH&SR5cO^cpVJ8g0|n>2*Ac{N+4AOJV79+Ut8 delta 108 zcmZoMXffEZl!e34z(hyE$k=G|T9z 10 : + top10 = list() + for i in range(10): + m = self.docuCount - 1 - i + top10.append(liste_score[m][0]) + return top10 + + return liste_score diff --git a/index_inverse/src/sort_index.py b/index_inverse/src/sort_index.py new file mode 100755 index 0000000..7098aca --- /dev/null +++ b/index_inverse/src/sort_index.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Apr 12 20:07:56 2021 + + +""" + +import numpy as np +from tf_idf import tf_idf +from bm25 import bm25 + +def sort_index (IndexInverse, standard="tf_classique/idf"): + # tf_idf=TF * IDF + # matrice_tf: [n_vocab, n_doc] + # matrice_idf: [n_vocab, 1] + # matrice_score:[n_vocab, n_doc] + if (standard=="tf_classique/idf"): + tf_matrix=tf_idf.tf_classique(IndexInverse.get_docs_content(),IndexInverse.get_vocabulary(),IndexInverse.get_nb_documents()) + idf=tf_idf.idf(IndexInverse.get_index(),IndexInverse.get_vocabulary(),IndexInverse.get_nb_vocabularys(),IndexInverse.get_nb_documents()) + score = tf_matrix * idf + elif (standard=="tf_normalisation_log/idf"): + nb_occ=tf_idf.nb_occ(IndexInverse.get_docs_content(),IndexInverse.get_vocabulary(),IndexInverse.get_nb_documents()) + idf=tf_idf.idf(IndexInverse.get_index(),IndexInverse.get_vocabulary(),IndexInverse.get_nb_vocabularys(),IndexInverse.get_nb_documents()) + tf_normalisation_log=tf_idf.tf_normalisation_log(nb_occ) + score = tf_normalisation_log * idf + elif (standard=="tf_normalisation_max/idf"): + nb_occ=tf_idf.nb_occ(IndexInverse.get_docs_content(),IndexInverse.get_vocabulary(),IndexInverse.get_nb_documents()) + idf=tf_idf.idf(IndexInverse.get_index(),IndexInverse.get_vocabulary(),IndexInverse.get_nb_vocabularys(),IndexInverse.get_nb_documents()) + tf_normalisation_max=tf_idf.tf_normalisation_max(nb_occ) + score = tf_normalisation_max * idf + elif (standard=="bm_25"): + nb_occ=tf_idf.nb_occ(IndexInverse.get_docs_content(),IndexInverse.get_vocabulary(),IndexInverse.get_nb_documents()) + idf_2=bm25.idf_2(IndexInverse.get_index(),IndexInverse.get_vocabulary(),IndexInverse.get_nb_vocabularys(),IndexInverse.get_nb_documents()) + score=bm25.bm_25(nb_occ,IndexInverse.get_docs_content(),IndexInverse.get_nb_documents(),idf_2) + else: + print("Please enter the correct sort method. 
") + return -1 + + sort_score=np.argsort(-score, axis=1) + for i,(word,docs) in zip (range(IndexInverse.get_nb_vocabularys()), IndexInverse.get_index().items()): + n=len(docs) + li=sort_score[i,:n] + IndexInverse.get_index()[word]=li.tolist() + return IndexInverse,score + + + + + + + + + + + diff --git a/index_inverse/test/test_recherche.py b/index_inverse/test/test_recherche.py new file mode 100755 index 0000000..a465f95 --- /dev/null +++ b/index_inverse/test/test_recherche.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Apr 13 03:08:05 2021 + + +""" + +from IndexInverse import IndexInverse +from sort_index import sort_index +from recherche import recherche +import sys +sys.path.append('../src') +import unittest + +class TestRecherche(unittest.TestCase): + def test_Recherche_1(self): + # recherche le mot "i" + index=IndexInverse() + index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') + index.ajout_url('lipn.fr_1', 'I love shanghai') + index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') + index.create_index() + index,score=sort_index(index) + self.assertEqual(recherche("i",index,score),['lipn.fr_1', 'lipn.fr_2', 'lipn.fr_3'] ) + + def test_Recherche_2(self): + # recherche mots cles ["i","am"] + index=IndexInverse() + index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') + index.ajout_url('lipn.fr_1', 'I love shanghai') + index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') + index.create_index() + index,score=sort_index(index) + self.assertEqual(recherche(["i","am"],index,score),['lipn.fr_1', 'lipn.fr_2', 'lipn.fr_3'] ) + + def test_Recherche_3(self): + # recherche mots cles ["i","am","from"] + index=IndexInverse() + index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') + index.ajout_url('lipn.fr_1', 'I love shanghai') + index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') + index.create_index() + index,score=sort_index(index) + self.assertEqual(recherche(["i","am","from"],index,score), ['lipn.fr_2', 'lipn.fr_1', 'lipn.fr_3'] ) + + +if __name__ == '__main__': + unittest.main() + \ No newline at end of file diff --git a/index_inverse/test/test_sort_index.py b/index_inverse/test/test_sort_index.py new file mode 100755 index 0000000..a570b76 --- /dev/null +++ b/index_inverse/test/test_sort_index.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Apr 13 02:43:25 2021 + + +""" + +import numpy as np +from IndexInverse import IndexInverse +from sort_index import sort_index +from numpy import inf +import sys +sys.path.append('../src') +import unittest + +class SaneEqualityArray(np.ndarray): + def __eq__(self, other): + return (isinstance(other, SaneEqualityArray) and self.shape == other.shape and np.ndarray.__eq__(self, other).all()) + +class TestSortIndex(unittest.TestCase): + def test_SortIndex_1(self): + # Faire en sortes avec le score "tf_classique/idf" + index=IndexInverse() + index.ajout_url('lipn.fr_1', 'I love shanghai') + index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') + index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') + index.create_index() + index,score=sort_index(index) + self.assertEqual(index.get_index(), {'i': [0, 1, 2], 'love': [0], 'shanghai': [0, 
1], 'am': [1, 2], 'from': [1, 2], + 'now': [1, 2], 'study': [1, 2], 'in': [1, 2], 'tongji': [1], 'university': [1, 2], + 'lanzhou': [2], 'of': [2], 'science': [2], 'and': [2], 'technolgy': [2]} ) + + np.testing.assert_allclose(score, np.array([[0.48877902, 0.29326741, 0.20947672], + [0.5364793 , 0. , 0. ], + [0.50135913, 0.15040774, 0. ], + [0. , 0.15040774, 0.1074341 ], + [0. , 0.15040774, 0.1074341 ], + [0. , 0.15040774, 0.1074341 ], + [0. , 0.15040774, 0.1074341 ], + [0. , 0.15040774, 0.1074341 ], + [0. , 0.16094379, 0. ], + [0. , 0.15040774, 0.1074341 ], + [0. , 0. , 0.2299197 ], + [0. , 0. , 0.11495985], + [0. , 0. , 0.11495985], + [0. , 0. , 0.11495985], + [0. , 0. , 0.11495985]]) ) + + + def test_SortIndex_2(self): + # Faire en sortes avec le score "tf_normalisation_log/idf" + index=IndexInverse() + index.ajout_url('lipn.fr_1', 'I love shanghai') + index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') + index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') + index.create_index() + index,score=sort_index(index,"tf_normalisation_log/idf") + self.assertEqual(index.get_index(), {'i': [1, 2, 0], 'love': [0], 'shanghai': [0, 1], 'am': [1, 2], 'from': [1, 2], + 'now': [1, 2], 'study': [1, 2], 'in': [1, 2], 'tongji': [1], 'university': [1, 2], + 'lanzhou': [2], 'of': [2], 'science': [2], 'and': [2], 'technolgy': [2]} ) + + np.testing.assert_allclose(score, np.array([[1.46633707, 2.48272447, 2.48272447], + [1.60943791, -inf, -inf], + [1.5040774 , 1.5040774 , -inf], + [ -inf, 1.5040774 , 1.5040774 ], + [ -inf, 1.5040774 , 1.5040774 ], + [ -inf, 1.5040774 , 1.5040774 ], + [ -inf, 1.5040774 , 1.5040774 ], + [ -inf, 1.5040774 , 1.5040774 ], + [ -inf, 1.60943791, -inf], + [ -inf, 1.5040774 , 1.5040774 ], + [ -inf, -inf, 2.72501526], + [ -inf, -inf, 1.60943791], + [ -inf, -inf, 1.60943791], + [ -inf, -inf, 1.60943791], + [ -inf, -inf, 1.60943791]])) + + def test_SortIndex_3(self): + # Faire en sortes avec le score "tf_normalisation_max/idf" + index=IndexInverse() + index.ajout_url('lipn.fr_1', 'I love shanghai') + index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') + index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') + index.create_index() + index,score=sort_index(index,"tf_normalisation_max/idf") + self.assertEqual(index.get_index(), {'i': [1, 2, 0], 'love': [0], 'shanghai': [0, 1], 'am': [1, 2], 'from': [1, 2], + 'now': [1, 2], 'study': [1, 2], 'in': [1, 2], 'tongji': [1], 'university': [1, 2], + 'lanzhou': [2], 'of': [2], 'science': [2], 'and': [2], 'technolgy': [2]} ) + + np.testing.assert_allclose(score, np.array([[1.0997528 , 1.46633707, 1.46633707], + [1.60943791, 0.80471896, 0.80471896], + [1.5040774 , 1.5040774 , 0.7520387 ], + [0.7520387 , 1.5040774 , 1.5040774 ], + [0.7520387 , 1.5040774 , 1.5040774 ], + [0.7520387 , 1.5040774 , 1.5040774 ], + [0.7520387 , 1.5040774 , 1.5040774 ], + [0.7520387 , 1.5040774 , 1.5040774 ], + [0.80471896, 1.60943791, 0.80471896], + [0.7520387 , 1.5040774 , 1.5040774 ], + [0.80471896, 0.80471896, 1.60943791], + [0.80471896, 0.80471896, 1.60943791], + [0.80471896, 0.80471896, 1.60943791], + [0.80471896, 0.80471896, 1.60943791], + [0.80471896, 0.80471896, 1.60943791]])) + + + def test_SortIndex_4(self): + # Faire en sortes avec le score "bm_25" + index=IndexInverse() + index.ajout_url('lipn.fr_1', 'I love shanghai') + index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji 
university') + index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') + index.create_index() + index,score=sort_index(index,"bm_25") + self.assertEqual(index.get_index(), {'i': [2, 1, 0], 'love': [0], 'shanghai': [2, 1], 'am': [0, 2], 'from': [0, 2], + 'now': [0, 2], 'study': [0, 2], 'in': [0, 2], 'tongji': [1], 'university': [0, 2], + 'lanzhou': [2], 'of': [2], 'science': [2], 'and': [2], 'technolgy': [2]} ) + + np.testing.assert_allclose(score, np.array([[-2.67562645, -2.59454687, -2.31405531], + [ 0.70238523, 0. , 0. ], + [-0.70238523, -0.48861581, -0. ], + [-0. , -0.48861581, -0.41622829], + [-0. , -0.48861581, -0.41622829], + [-0. , -0.48861581, -0.41622829], + [-0. , -0.48861581, -0.41622829], + [-0. , -0.48861581, -0.41622829], + [ 0. , 0.48861581, 0. ], + [-0. , -0.48861581, -0.41622829], + [ 0. , 0. , 0.60746831], + [ 0. , 0. , 0.41622829], + [ 0. , 0. , 0.41622829], + [ 0. , 0. , 0.41622829], + [ 0. , 0. , 0.41622829]])) + + + +if __name__ == '__main__': + unittest.main() + -- GitLab From fdf62b2ce2c3ded5a514fa42a6da83a4f7b2b813 Mon Sep 17 00:00:00 2001 From: LILI JIN Date: Tue, 13 Apr 2021 03:44:54 +0200 Subject: [PATCH 4/9] sorte, charger et recherche --- .DS_Store | Bin 8196 -> 0 bytes index_inverse/.DS_Store | Bin 6148 -> 0 bytes index_inverse/src/.DS_Store | Bin 6148 -> 0 bytes index_inverse/test/.DS_Store | Bin 6148 -> 0 bytes pagerank/.DS_Store | Bin 6148 -> 0 bytes 5 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .DS_Store delete mode 100644 index_inverse/.DS_Store delete mode 100644 index_inverse/src/.DS_Store delete mode 100644 index_inverse/test/.DS_Store delete mode 100644 pagerank/.DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index b20781aacfa0e953d9d8c452ee4cdecf4977045a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeHM&1(}u6o1o3yFnU*NUayyiwB`d)2abMh%qAGg!SW4OS;>QNp!OtvYV_LBS8eA z;JNC_o9Mx7FP`)u-WC4?@z|SJ-+b8a>^5r;LPMQ_nKzmDu{*!t&dk0I0FZLmE&$8| zfQFUI{2^?1C{#~tqD+X96DSG#gHtaUt6r;}uv7yI0fm4nLGPdg}eAxlCIxkL>X!d+2vS0Yjj z!rgIP7j`8fhg@=ZAR_r7;>tv1D1==d~N?_^>!bKu~Vmd$F}>Fl!6@IoUngRWmTJ4^J}H8W^fqJQ1mm+qaso?S*Bp>)JkxV{iq#P&Gf(?=Q^h3yTYbIlW&j&d=%R&YT?# zvf8neg$q|#S`Qxf9}OO_(^=qNBlaXX%ztmSFR&Mzr0uUd7TzYciK!&?fxJyxL^(M1 z=KS+3zdlk9VsTpDHWQ<*BtAgLF!4A&qvJZBTSpC@;50rVF4)M_eWO5Q~HY(tQ@gWdM>OtX`T zVy`ucSP6Q3x{V_}Roql-VC`3WV$+o9$$j|A=!w}x>CtnI<2Q2b{L*m$k|2w&s5Co? 
zp4=u~M~PT5dZ^7fl}Uaxvo!y|Jo@|pINh?k7lnXA;2$GEG8Lmz#-M-RW8?8@uB~HT z!Ac9Y8*<59uwm&qN|uhJ-1@^1`#PFZpOTP6E^!3Q9(@Q%eP>Ucv(fipl^lqzw~ OO7y7npRKFEHNOFJ)omjH diff --git a/index_inverse/.DS_Store b/index_inverse/.DS_Store deleted file mode 100644 index 57e7f8ff1a4a2aa8558dec8ead74f7b0ef811fa0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~&rjPh6vu6vjbuNx3QdBeFT3^7vL8L5Nm*A8+>kPj1DhZr9gUR6Rgj*<3*SO!qL3i@-sO@kp|2>fdVaPL-F$_DHp7?t;L9Hm*ke$AIE)`N%3 zw&U2&Q|GhSl_M|nvtiou2fJ9?_Oot4Otj5?#e>ht$9$^0nk3{-FwcA(3r zohS+As3nIUEww_Uenq+pYcHx1;fK ze)3)OB)AR4laA+WQSj@T)r!N!v)@cw>;_@ z&P(%s^#(~SlS631UuUt5bpUjBUolS>I1Bm|Kgq;bM*U^?JSWs8{}lFJ{#o2VVd&}D zzX3;|u|))E)7Xy}buvCfz!3N^3E=r4AfdXXg;sTRK&GAmpcXVMgEGD)Fh-ENq=i=S zfN>oPszW(FF}Mzgc0v7I(n6~aXPh1%oMz_qLg8w57+FnLMya@nLh#wgUJkmze?a1tLw() diff --git a/index_inverse/src/.DS_Store b/index_inverse/src/.DS_Store deleted file mode 100644 index 264c3e2eae6ad24cf666c60b74bb6968d69c2b68..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKzi-n(6n>Yct*xqRgpkU3OScS373hGF+zxLl1%lHdow51!(U!aJK@;;S~4oa{q#8GRmV=MacLbc+`R638t=vC621hxB5RaS&a)*I^N zB<>;Q^29kocN?u)T~7D*8g==o^>8+Go$a0Gljp~S z)A!S}*@w?YVu8O#ZCe&E;0p#zR-2wQ{jM71hPrdGoHDEPxrp<-^7mR z?5{_Ue}BA@o%gTK!lTJ7neRJ)n58N^LB_>uiOt3eumY^WZ4|KQhO==SGILk~R^Y!< zfX)YrPUu-I4eFx<3;hH@tYWn`jOi^wInttMu{4M+Xu_l-np9z*7{a7uy|g^fVrkH% zgRqYeVMiABg(CFm*uT_t5S~FUSpinyx&kZavqty-v*qvq>q-2@3a|qIlmepK@jGo? zlD%6O7DsojL_bC+BVK86o`QkCijhlK@dI>iST89+^emPJu?CI*2xu9&UH1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0sw6{q4JoF#8Qi5xk!6zGjK z^VYNTcI8)iJR+j=_w`I?$+YK(#ZrUE8i+x)q);5=JM87`KtV zY~)S$L;lCt>(Aru_x<_)`gH!K{oV6y@zia1aZe3kqykic3Qz$mKm~>hpl6#+ZUPyp z02QDDCk5>LP~e6&u@Cf52ZE0Pzy@hItbLXM7E1tYVjqYMOoIvxs^-L?K}Wu1T}|u* zgD&bfHy|%!Y@S~yD%MrX41HBdF!CLX%psv^&`!%r-bUN}*2l7Y2bfHm!zfj-+@ckqm -- GitLab From 1d880ab1acbb9e39c06cbd9f0176a724505f1c78 Mon Sep 17 00:00:00 2001 From: LILI JIN Date: Tue, 13 Apr 2021 03:56:07 +0200 Subject: [PATCH 5/9] sorte, charger et recherche --- index_inverse/test/test_IndexInverse.py | 106 ++++++++++++------------ 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/index_inverse/test/test_IndexInverse.py b/index_inverse/test/test_IndexInverse.py index b0171e3..46be959 100644 --- a/index_inverse/test/test_IndexInverse.py +++ b/index_inverse/test/test_IndexInverse.py @@ -1,4 +1,4 @@ -<<<<<<< HEAD + #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ @@ -80,55 +80,55 @@ class TestIndexInverse(unittest.TestCase): if __name__ == '__main__': unittest.main() -======= -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Sat Mar 13 18:41:56 2021 - - -""" -import sys -sys.path.append('../src') -from IndexInverse import IndexInverse -from unittest import TestCase - -class TestIndexInverse(TestCase): - index = IndexInverse() - index.ajout_url('lipn.fr_1', 'I love shanghai') - index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') - index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') - index.create_index() - def test_IndexInverse_1(self): - result = TestIndexInverse.index.get_docs_with_keyword('i') - self.assertEqual(result, [0,1,2]) - - def test_IndexInverse_2(self): - result = TestIndexInverse.index.get_docs_with_keyword('like') - self.assertEqual(result, -1) - - def test_IndexInverse_3(self): - result = TestIndexInverse.index.get_nb_documents() - 
self.assertEqual(result, 3) - - def test_IndexInverse_4(self): - result = TestIndexInverse.index.get_nb_vocabularys() - self.assertEqual(result, 15) - - def test_IndexInverse_5(self): - result = TestIndexInverse.index.get_vocabulary() - self.assertEqual(result, {'i': 0, 'love': 1, 'shanghai': 2, 'am': 3, 'from': 4, 'now': 5, 'study': 6, 'in': 7, 'tongji': 8, - 'university': 9, 'lanzhou': 10, 'of': 11, 'science': 12, 'and': 13, 'technolgy': 14}) - - def test_IndexInverse_6(self): - result = TestIndexInverse.index.get_document_id() - self.assertEqual(result, {0: 'lipn.fr_1', 1: 'lipn.fr_2', 2: 'lipn.fr_3'}) - - def test_IndexInverse_7(self): - result = TestIndexInverse.index.get_docs_content() - self.assertEqual(result, {0: ['i', 'love', 'shanghai'], 1: ['i','am','from', 'shanghai', 'now', 'i', 'study', 'in', 'tongji', 'university'],2: ['i', 'am', 'from', 'lanzhou', 'now', 'i', 'study', 'in', 'lanzhou', 'university', 'of', 'science', 'and', 'technolgy']}) - - - - ->>>>>>> master +======= +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sat Mar 13 18:41:56 2021 + + +""" +import sys +sys.path.append('../src') +from IndexInverse import IndexInverse +from unittest import TestCase + +class TestIndexInverse(TestCase): + index = IndexInverse() + index.ajout_url('lipn.fr_1', 'I love shanghai') + index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') + index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') + index.create_index() + def test_IndexInverse_1(self): + result = TestIndexInverse.index.get_docs_with_keyword('i') + self.assertEqual(result, [0,1,2]) + + def test_IndexInverse_2(self): + result = TestIndexInverse.index.get_docs_with_keyword('like') + self.assertEqual(result, -1) + + def test_IndexInverse_3(self): + result = TestIndexInverse.index.get_nb_documents() + self.assertEqual(result, 3) + + def test_IndexInverse_4(self): + result = TestIndexInverse.index.get_nb_vocabularys() + self.assertEqual(result, 15) + + def test_IndexInverse_5(self): + result = TestIndexInverse.index.get_vocabulary() + self.assertEqual(result, {'i': 0, 'love': 1, 'shanghai': 2, 'am': 3, 'from': 4, 'now': 5, 'study': 6, 'in': 7, 'tongji': 8, + 'university': 9, 'lanzhou': 10, 'of': 11, 'science': 12, 'and': 13, 'technolgy': 14}) + + def test_IndexInverse_6(self): + result = TestIndexInverse.index.get_document_id() + self.assertEqual(result, {0: 'lipn.fr_1', 1: 'lipn.fr_2', 2: 'lipn.fr_3'}) + + def test_IndexInverse_7(self): + result = TestIndexInverse.index.get_docs_content() + self.assertEqual(result, {0: ['i', 'love', 'shanghai'], 1: ['i','am','from', 'shanghai', 'now', 'i', 'study', 'in', 'tongji', 'university'],2: ['i', 'am', 'from', 'lanzhou', 'now', 'i', 'study', 'in', 'lanzhou', 'university', 'of', 'science', 'and', 'technolgy']}) + + + + +>>>>>>> master -- GitLab From 2186cd6ed8b5d4cff8833767d7316710b1ec4b1c Mon Sep 17 00:00:00 2001 From: LILI JIN Date: Tue, 13 Apr 2021 04:01:53 +0200 Subject: [PATCH 6/9] sorte, charger et recherche --- index_inverse/test/test_IndexInverse.py | 51 ------------------------- 1 file changed, 51 deletions(-) diff --git a/index_inverse/test/test_IndexInverse.py b/index_inverse/test/test_IndexInverse.py index 46be959..fcd1502 100644 --- a/index_inverse/test/test_IndexInverse.py +++ b/index_inverse/test/test_IndexInverse.py @@ -80,55 +80,4 @@ class TestIndexInverse(unittest.TestCase): if __name__ == '__main__': unittest.main() -======= -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- 
-""" -Created on Sat Mar 13 18:41:56 2021 - - -""" -import sys -sys.path.append('../src') -from IndexInverse import IndexInverse -from unittest import TestCase - -class TestIndexInverse(TestCase): - index = IndexInverse() - index.ajout_url('lipn.fr_1', 'I love shanghai') - index.ajout_url('lipn.fr_2', 'i am from shanghai now i study in tongji university') - index.ajout_url('lipn.fr_3', 'i am from lanzhou now i study in lanzhou university of science and technolgy') - index.create_index() - def test_IndexInverse_1(self): - result = TestIndexInverse.index.get_docs_with_keyword('i') - self.assertEqual(result, [0,1,2]) - - def test_IndexInverse_2(self): - result = TestIndexInverse.index.get_docs_with_keyword('like') - self.assertEqual(result, -1) - - def test_IndexInverse_3(self): - result = TestIndexInverse.index.get_nb_documents() - self.assertEqual(result, 3) - - def test_IndexInverse_4(self): - result = TestIndexInverse.index.get_nb_vocabularys() - self.assertEqual(result, 15) - - def test_IndexInverse_5(self): - result = TestIndexInverse.index.get_vocabulary() - self.assertEqual(result, {'i': 0, 'love': 1, 'shanghai': 2, 'am': 3, 'from': 4, 'now': 5, 'study': 6, 'in': 7, 'tongji': 8, - 'university': 9, 'lanzhou': 10, 'of': 11, 'science': 12, 'and': 13, 'technolgy': 14}) - - def test_IndexInverse_6(self): - result = TestIndexInverse.index.get_document_id() - self.assertEqual(result, {0: 'lipn.fr_1', 1: 'lipn.fr_2', 2: 'lipn.fr_3'}) - - def test_IndexInverse_7(self): - result = TestIndexInverse.index.get_docs_content() - self.assertEqual(result, {0: ['i', 'love', 'shanghai'], 1: ['i','am','from', 'shanghai', 'now', 'i', 'study', 'in', 'tongji', 'university'],2: ['i', 'am', 'from', 'lanzhou', 'now', 'i', 'study', 'in', 'lanzhou', 'university', 'of', 'science', 'and', 'technolgy']}) - - - ->>>>>>> master -- GitLab From 9f549c363a47ae9e0c09af94baa7ff5da158186b Mon Sep 17 00:00:00 2001 From: LILI JIN Date: Tue, 13 Apr 2021 04:06:34 +0200 Subject: [PATCH 7/9] update index_inverse --- index_inverse/src/IndexInverse.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/index_inverse/src/IndexInverse.py b/index_inverse/src/IndexInverse.py index 9f82ca8..3a8b1fc 100644 --- a/index_inverse/src/IndexInverse.py +++ b/index_inverse/src/IndexInverse.py @@ -44,16 +44,10 @@ class IndexInverse : else: self.vocabulary[word]=a a=a+1 - self.invert_index.setdefault(word,[]).append(doc_id) -<<<<<<< HEAD + self.invert_index.setdefault(word,[]).append(doc_id) self.vocaCount=len(self.vocabulary) -======= - IndexInverse.vocaCount=len(self.vocabulary) - - ->>>>>>> master - + def get_index(self): '''Obtenir le dictionnaire invert_index. 
''' -- GitLab From 74aaf6c51cdfed1edb7814daacf4ac08b5d81e57 Mon Sep 17 00:00:00 2001 From: LILI JIN Date: Tue, 13 Apr 2021 04:15:05 +0200 Subject: [PATCH 8/9] update index_inverse --- index_inverse/test/test_recherche.py | 5 ++--- index_inverse/test/test_sort_index.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/index_inverse/test/test_recherche.py b/index_inverse/test/test_recherche.py index a465f95..5352053 100755 --- a/index_inverse/test/test_recherche.py +++ b/index_inverse/test/test_recherche.py @@ -5,12 +5,11 @@ Created on Tue Apr 13 03:08:05 2021 """ - +sys.path.append('../src') from IndexInverse import IndexInverse from sort_index import sort_index from recherche import recherche import sys -sys.path.append('../src') import unittest class TestRecherche(unittest.TestCase): @@ -47,4 +46,4 @@ class TestRecherche(unittest.TestCase): if __name__ == '__main__': unittest.main() - \ No newline at end of file + diff --git a/index_inverse/test/test_sort_index.py b/index_inverse/test/test_sort_index.py index a570b76..c256ea3 100755 --- a/index_inverse/test/test_sort_index.py +++ b/index_inverse/test/test_sort_index.py @@ -7,11 +7,11 @@ Created on Tue Apr 13 02:43:25 2021 """ import numpy as np +sys.path.append('../src') from IndexInverse import IndexInverse from sort_index import sort_index from numpy import inf import sys -sys.path.append('../src') import unittest class SaneEqualityArray(np.ndarray): -- GitLab From d190dfa319e68476794c2817135f2c846b5cc930 Mon Sep 17 00:00:00 2001 From: LILI JIN Date: Tue, 13 Apr 2021 04:20:46 +0200 Subject: [PATCH 9/9] update index_inverse --- index_inverse/test/test_recherche.py | 3 ++- index_inverse/test/test_sort_index.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/index_inverse/test/test_recherche.py b/index_inverse/test/test_recherche.py index 5352053..e40f960 100755 --- a/index_inverse/test/test_recherche.py +++ b/index_inverse/test/test_recherche.py @@ -5,11 +5,12 @@ Created on Tue Apr 13 03:08:05 2021 """ + +import sys sys.path.append('../src') from IndexInverse import IndexInverse from sort_index import sort_index from recherche import recherche -import sys import unittest class TestRecherche(unittest.TestCase): diff --git a/index_inverse/test/test_sort_index.py b/index_inverse/test/test_sort_index.py index c256ea3..501d154 100755 --- a/index_inverse/test/test_sort_index.py +++ b/index_inverse/test/test_sort_index.py @@ -7,11 +7,11 @@ Created on Tue Apr 13 02:43:25 2021 """ import numpy as np +import sys sys.path.append('../src') from IndexInverse import IndexInverse from sort_index import sort_index from numpy import inf -import sys import unittest class SaneEqualityArray(np.ndarray): -- GitLab
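Editor's note — worked example of the scoring pipeline introduced by this patch series. The patches above wire together nb_occ (raw term counts), the tf variants, idf/idf_2 and bm_25 into sort_index (which reorders each posting list by np.argsort(-score)) and recherche (which returns the best-ranked URLs, capped at 10). The following is a minimal, self-contained sketch of that pipeline, not the repository's code: the flat function names and signatures (build_index, bm25_scores, recherche(keywords, docs, urls, top_k)) are illustrative and do not match the class-based API of IndexInverse, tf_idf, bm25 and sort_index; only the formulas and constants visible in bm25.py (k=1.2, b=0.75, idf = log((N - df + 0.5) / (df + 0.5))) and the [n_vocab, n_doc] matrix layout are taken from the patches.

import numpy as np
from collections import Counter


def build_index(docs):
    """Tokenise each document and build the vocabulary and the inverted index."""
    docs_tok = [d.lower().split() for d in docs]
    vocabulary, invert_index = {}, {}
    for doc_id, doc in enumerate(docs_tok):
        for word in doc:
            vocabulary.setdefault(word, len(vocabulary))   # sequential word ids
            postings = invert_index.setdefault(word, [])
            if doc_id not in postings:
                postings.append(doc_id)
    return docs_tok, vocabulary, invert_index


def nb_occ(docs_tok, vocabulary):
    """Raw term counts: one row per word, one column per document ([n_vocab, n_doc])."""
    tf = np.zeros((len(vocabulary), len(docs_tok)))
    for doc_id, doc in enumerate(docs_tok):
        for word, count in Counter(doc).items():
            tf[vocabulary[word], doc_id] = count
    return tf


def bm25_scores(docs_tok, vocabulary, invert_index, k=1.2, b=0.75):
    """BM25 score matrix [n_vocab, n_doc] with idf = log((N - df + 0.5) / (df + 0.5))."""
    n_docs = len(docs_tok)
    tf = nb_occ(docs_tok, vocabulary)
    df = np.zeros((len(vocabulary), 1))
    for word, word_id in vocabulary.items():
        df[word_id, 0] = len(invert_index[word])
    idf = np.log((n_docs - df + 0.5) / (df + 0.5))      # negative for terms present in most documents
    lengths = np.array([len(d) for d in docs_tok], dtype=float)
    norm = k * (1 - b + b * lengths / lengths.mean())   # per-document length normalisation
    return idf * (tf * (k + 1) / (tf + norm))


def recherche(keywords, docs, urls, top_k=10):
    """Rank documents for one keyword or a list of keywords by their summed BM25 score."""
    docs_tok, vocabulary, invert_index = build_index(docs)
    scores = bm25_scores(docs_tok, vocabulary, invert_index)
    if isinstance(keywords, str):
        keywords = [keywords]
    rows = [vocabulary[w] for w in keywords if w in vocabulary]
    if not rows:
        return -1                                        # same convention as get_docs_with_keyword
    total = scores[rows, :].sum(axis=0)                  # one aggregated score per document
    order = np.argsort(-total)[:top_k]                   # best documents first, capped at top_k
    return [urls[i] for i in order]


if __name__ == "__main__":
    urls = ['lipn.fr_1', 'lipn.fr_2', 'lipn.fr_3']
    docs = ['I love shanghai',
            'i am from shanghai now i study in tongji university',
            'i am from lanzhou now i study in lanzhou university of science and technolgy']
    print(recherche(['i', 'am'], docs, urls))

Two design points worth noting from the patches themselves: bm_25 is fed the raw counts (nb_occ) rather than the length-normalised tf_classique, since BM25 does its own length normalisation against the average document length; and with only three documents the idf_2 term goes negative for words that occur in most of them, which is why the expected matrices in test_bm25.py and test_sort_index.py contain negative scores for very common words such as "i".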