Commit eb6f8a84 authored by LILI JIN

add tf/idf and dm-25

parent c9d51346
Branch: methode_search
Merge request: !19 Mehdi lili index inverse
.DS_Store 0 → 100644
File added
File added
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 30 19:51:28 2021
@author: jinlili
dm-25
"""
import numpy as np
class dm25:
    @staticmethod
    def idf_2(invert_index, vocabulary, vocaCount, docuCount):
        '''Inverse Document Frequency (IDF), BM25 variant:
        IDF(t) = log((|D| - |D_t| + 0.5) / (|D_t| + 0.5))
        where |D| is the number of documents in the collection and
        |D_t| is the number of documents containing term t.
        '''
        # Document frequency of each term, stored at the row given by its
        # vocabulary index.
        df = np.zeros((vocaCount, 1))
        for (key, val) in vocabulary.items():
            index_val = invert_index.get(key)
            if index_val is not None:
                df[val, 0] = len(index_val)
        idf_2 = np.log((docuCount - df + 0.5) / (df + 0.5))
        return idf_2
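
    # Worked example (values match the unit test below): with docuCount = 3
    # and a term appearing in df = 1 document, IDF = log(2.5 / 1.5) ≈ 0.51083;
    # a term appearing in all 3 documents gets log(0.5 / 3.5) ≈ -1.94591,
    # i.e. very common terms receive a negative weight.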
    @staticmethod
    def dm_25(tf_matrix_classique, docs_content, docuCount, idf_2, k=1.2, b=0.75):
        '''BM25 weighting:
        BM25(t, d) = IDF(t) * f(t,d) * (k1 + 1) / (f(t,d) + k1 * (1 - b + b * |d| / avgdl))
        where f(t,d) is the (normalised) frequency of term t in document d,
        |d| is the length of document d,
        avgdl (Average Document Length) is the mean document length over the collection,
        k1 in [1.2, 2.0] in practice, and b = 0.75 in practice.
        (docuCount is kept in the signature but is not used here.)
        '''
        # Length (token count) of each document.
        d_longueur = np.array([])
        for d in docs_content:
            d_longueur = np.append(d_longueur, len(d))
        sum_d = np.sum(d_longueur)
        avg_d = sum_d / np.size(d_longueur)
        # Broadcasting: idf_2 is (|V|, 1), tf_matrix_classique is (|V|, |D|)
        # and d_longueur is (|D|,), giving a (|V|, |D|) score matrix.
        dm_25 = idf_2 * (tf_matrix_classique * (k + 1)
                         / (tf_matrix_classique + k * (1 - b + b * (d_longueur / avg_d))))
        return dm_25
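
# Worked check of the formula above, with the numbers from the unit test:
# for a term with tf = 1 in a document of length 3, avgdl = 9, IDF ≈ 0.51083,
# k = 1.2 and b = 0.75:
#     2.2 / (1 + 1.2 * (1 - 0.75 + 0.75 * 3 / 9)) = 2.2 / 1.6 = 1.375
#     1.375 * 0.51083 ≈ 0.70239
# which matches the expected score of 'love' in the first test document.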
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 31 00:08:41 2021
@author: jinlili
"""
import numpy as np
import sys
sys.path.append('../src')
from dm25 import dm25
import unittest
class SaneEqualityArray(np.ndarray):
    # Equality helper comparing both shape and contents; defined here but not
    # used by the tests below (they use np.testing.assert_allclose instead).
    def __eq__(self, other):
        return (isinstance(other, SaneEqualityArray) and self.shape == other.shape
                and np.ndarray.__eq__(self, other).all())

class TestDM25(unittest.TestCase):
    def test_DM25_1(self):
        invert_index = {'i': [0, 1, 2], 'love': [0], 'shanghai': [0, 1], 'am': [1, 2], 'from': [1, 2],
                        'now': [1, 2], 'study': [1, 2], 'in': [1, 2], 'tongji': [1], 'university': [1, 2],
                        'lanzhou': [2], 'of': [2], 'science': [2], 'and': [2], 'technolgy': [2]}
        vocabulary = {'i': 0, 'love': 1, 'shanghai': 2, 'am': 3, 'from': 4, 'now': 5, 'study': 6, 'in': 7,
                      'tongji': 8, 'university': 9, 'lanzhou': 10, 'of': 11, 'science': 12, 'and': 13,
                      'technolgy': 14}
        docuCount = 3
        vocaCount = 15
        i2 = dm25.idf_2(invert_index, vocabulary, vocaCount, docuCount)
        np.testing.assert_allclose(i2, np.array([[-1.94591015],
                                                 [ 0.51082562],
                                                 [-0.51082562],
                                                 [-0.51082562],
                                                 [-0.51082562],
                                                 [-0.51082562],
                                                 [-0.51082562],
                                                 [-0.51082562],
                                                 [ 0.51082562],
                                                 [-0.51082562],
                                                 [ 0.51082562],
                                                 [ 0.51082562],
                                                 [ 0.51082562],
                                                 [ 0.51082562],
                                                 [ 0.51082562]]))
    def test_DM25_2(self):
        tf_matrix_classique = np.array([[1. , 1. , 1. ],
                                        [1. , 0. , 0. ],
                                        [1. , 0.5, 0. ],
                                        [0. , 0.5, 0.5],
                                        [0. , 0.5, 0.5],
                                        [0. , 0.5, 0.5],
                                        [0. , 0.5, 0.5],
                                        [0. , 0.5, 0.5],
                                        [0. , 0.5, 0. ],
                                        [0. , 0.5, 0.5],
                                        [0. , 0. , 1. ],
                                        [0. , 0. , 0.5],
                                        [0. , 0. , 0.5],
                                        [0. , 0. , 0.5],
                                        [0. , 0. , 0.5]])
        docs_content = [['i', 'love', 'shanghai'],
                        ['i', 'am', 'from', 'shanghai', 'now', 'i', 'study', 'in', 'tongji', 'university'],
                        ['i', 'am', 'from', 'lanzhou', 'now', 'i', 'study', 'in', 'lanzhou',
                         'university', 'of', 'science', 'and', 'technolgy']]
        docuCount = 3
        i2 = np.array([[-1.94591015],
                       [ 0.51082562],
                       [-0.51082562],
                       [-0.51082562],
                       [-0.51082562],
                       [-0.51082562],
                       [-0.51082562],
                       [-0.51082562],
                       [ 0.51082562],
                       [-0.51082562],
                       [ 0.51082562],
                       [ 0.51082562],
                       [ 0.51082562],
                       [ 0.51082562],
                       [ 0.51082562]])
        d25 = dm25.dm_25(tf_matrix_classique, docs_content, docuCount, i2)
        np.testing.assert_allclose(d25, np.array([[-2.67562645, -1.86130536, -1.58555642],
                                                  [ 0.70238523,  0.        ,  0.        ],
                                                  [-0.70238523, -0.31217121, -0.        ],
                                                  [-0.        , -0.31217121, -0.25541281],
                                                  [-0.        , -0.31217121, -0.25541281],
                                                  [-0.        , -0.31217121, -0.25541281],
                                                  [-0.        , -0.31217121, -0.25541281],
                                                  [-0.        , -0.31217121, -0.25541281],
                                                  [ 0.        ,  0.31217121,  0.        ],
                                                  [-0.        , -0.31217121, -0.25541281],
                                                  [ 0.        ,  0.        ,  0.41622829],
                                                  [ 0.        ,  0.        ,  0.25541281],
                                                  [ 0.        ,  0.        ,  0.25541281],
                                                  [ 0.        ,  0.        ,  0.25541281],
                                                  [ 0.        ,  0.        ,  0.25541281]]))


if __name__ == '__main__':
    unittest.main()
\ No newline at end of file
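
A minimal usage sketch of the two methods above, assuming dm25.py is on the import path; the toy corpus and the max-normalised tf matrix are assumptions mirroring the unit test fixtures, not part of the commit:

import numpy as np
from dm25 import dm25

invert_index = {'i': [0, 1, 2], 'love': [0], 'shanghai': [0, 1]}
vocabulary = {'i': 0, 'love': 1, 'shanghai': 2}
docs_content = [['i', 'love', 'shanghai'], ['i', 'shanghai'], ['i']]
vocaCount, docuCount = 3, 3

# Term-frequency matrix (|V| x |D|), counts normalised by the per-document maximum.
tf_matrix_classique = np.array([[1., 1., 1.],
                                [1., 0., 0.],
                                [1., 1., 0.]])

idf = dm25.idf_2(invert_index, vocabulary, vocaCount, docuCount)
scores = dm25.dm_25(tf_matrix_classique, docs_content, docuCount, idf)
print(scores.shape)  # (3, 3): one BM25 score per (term, document) pair

The tests themselves can be run with, e.g., python3 -m unittest from the test directory (the test module's file name is not shown in this diff).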