Commit eb6f8a84 authored by LILI JIN

add tf/idf and dm-25

parent c9d51346
Branch: methode_search
Merge request: !19 Mehdi lili index inverse
.DS_Store 0 → 100644
File added
File added
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 30 19:51:28 2021
@author: jinlili
dm-25
"""
import numpy as np
class dm25:
    @staticmethod
    def idf_2(invert_index, vocabulary, vocaCount, docuCount):
        '''Inverse Document Frequency (IDF), BM25 variant:
        IDF(t) = log((|D| - |D_t| + 0.5) / (|D_t| + 0.5))
        where |D| is the number of documents in the collection and
        |D_t| is the number of documents containing term t.
        '''
        # Document frequency of each term, stored at the row given by its
        # vocabulary index.
        df = np.zeros((vocaCount, 1))
        for (key, val) in vocabulary.items():
            index_val = invert_index.get(key)
            if index_val is not None:
                df[val, 0] = len(index_val)
        idf_2 = np.log((docuCount - df + 0.5) / (df + 0.5))
        return idf_2
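
    # Worked example (values match the unit test below): with docuCount = 3
    # and a term appearing in df = 1 document, IDF = log(2.5 / 1.5) ≈ 0.51083;
    # a term appearing in all 3 documents gets log(0.5 / 3.5) ≈ -1.94591,
    # i.e. very common terms receive a negative weight.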
    @staticmethod
    def dm_25(tf_matrix_classique, docs_content, docuCount, idf_2, k=1.2, b=0.75):
        '''BM25 weighting:
        BM25(t, d) = IDF(t) * f(t,d) * (k1 + 1) / (f(t,d) + k1 * (1 - b + b * |d| / avgdl))
        where f(t,d) is the (normalised) frequency of term t in document d,
        |d| is the length of document d,
        avgdl (Average Document Length) is the mean document length over the collection,
        k1 in [1.2, 2.0] in practice, and b = 0.75 in practice.
        (docuCount is kept in the signature but is not used here.)
        '''
        # Length (token count) of each document.
        d_longueur = np.array([])
        for d in docs_content:
            d_longueur = np.append(d_longueur, len(d))
        sum_d = np.sum(d_longueur)
        avg_d = sum_d / np.size(d_longueur)
        # Broadcasting: idf_2 is (|V|, 1), tf_matrix_classique is (|V|, |D|)
        # and d_longueur is (|D|,), giving a (|V|, |D|) score matrix.
        dm_25 = idf_2 * (tf_matrix_classique * (k + 1)
                         / (tf_matrix_classique + k * (1 - b + b * (d_longueur / avg_d))))
        return dm_25
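
# Worked check of the formula above, with the numbers from the unit test:
# for a term with tf = 1 in a document of length 3, avgdl = 9, IDF ≈ 0.51083,
# k = 1.2 and b = 0.75:
#     2.2 / (1 + 1.2 * (1 - 0.75 + 0.75 * 3 / 9)) = 2.2 / 1.6 = 1.375
#     1.375 * 0.51083 ≈ 0.70239
# which matches the expected score of 'love' in the first test document.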
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 31 00:08:41 2021
@author: jinlili
"""
import numpy as np
import sys
sys.path.append('../src')
from dm25 import dm25
import unittest
class SaneEqualityArray(np.ndarray):
    # Equality helper comparing both shape and contents; defined here but not
    # used by the tests below (they use np.testing.assert_allclose instead).
    def __eq__(self, other):
        return (isinstance(other, SaneEqualityArray) and self.shape == other.shape
                and np.ndarray.__eq__(self, other).all())

class TestDM25(unittest.TestCase):
    def test_DM25_1(self):
        invert_index = {'i': [0, 1, 2], 'love': [0], 'shanghai': [0, 1], 'am': [1, 2], 'from': [1, 2],
                        'now': [1, 2], 'study': [1, 2], 'in': [1, 2], 'tongji': [1], 'university': [1, 2],
                        'lanzhou': [2], 'of': [2], 'science': [2], 'and': [2], 'technolgy': [2]}
        vocabulary = {'i': 0, 'love': 1, 'shanghai': 2, 'am': 3, 'from': 4, 'now': 5, 'study': 6, 'in': 7,
                      'tongji': 8, 'university': 9, 'lanzhou': 10, 'of': 11, 'science': 12, 'and': 13,
                      'technolgy': 14}
        docuCount = 3
        vocaCount = 15
        i2 = dm25.idf_2(invert_index, vocabulary, vocaCount, docuCount)
        np.testing.assert_allclose(i2, np.array([[-1.94591015],
                                                 [ 0.51082562],
                                                 [-0.51082562],
                                                 [-0.51082562],
                                                 [-0.51082562],
                                                 [-0.51082562],
                                                 [-0.51082562],
                                                 [-0.51082562],
                                                 [ 0.51082562],
                                                 [-0.51082562],
                                                 [ 0.51082562],
                                                 [ 0.51082562],
                                                 [ 0.51082562],
                                                 [ 0.51082562],
                                                 [ 0.51082562]]))
    def test_DM25_2(self):
        tf_matrix_classique = np.array([[1. , 1. , 1. ],
                                        [1. , 0. , 0. ],
                                        [1. , 0.5, 0. ],
                                        [0. , 0.5, 0.5],
                                        [0. , 0.5, 0.5],
                                        [0. , 0.5, 0.5],
                                        [0. , 0.5, 0.5],
                                        [0. , 0.5, 0.5],
                                        [0. , 0.5, 0. ],
                                        [0. , 0.5, 0.5],
                                        [0. , 0. , 1. ],
                                        [0. , 0. , 0.5],
                                        [0. , 0. , 0.5],
                                        [0. , 0. , 0.5],
                                        [0. , 0. , 0.5]])
        docs_content = [['i', 'love', 'shanghai'],
                        ['i', 'am', 'from', 'shanghai', 'now', 'i', 'study', 'in', 'tongji', 'university'],
                        ['i', 'am', 'from', 'lanzhou', 'now', 'i', 'study', 'in', 'lanzhou',
                         'university', 'of', 'science', 'and', 'technolgy']]
        docuCount = 3
        i2 = np.array([[-1.94591015],
                       [ 0.51082562],
                       [-0.51082562],
                       [-0.51082562],
                       [-0.51082562],
                       [-0.51082562],
                       [-0.51082562],
                       [-0.51082562],
                       [ 0.51082562],
                       [-0.51082562],
                       [ 0.51082562],
                       [ 0.51082562],
                       [ 0.51082562],
                       [ 0.51082562],
                       [ 0.51082562]])
        d25 = dm25.dm_25(tf_matrix_classique, docs_content, docuCount, i2)
        np.testing.assert_allclose(d25, np.array([[-2.67562645, -1.86130536, -1.58555642],
                                                  [ 0.70238523,  0.        ,  0.        ],
                                                  [-0.70238523, -0.31217121, -0.        ],
                                                  [-0.        , -0.31217121, -0.25541281],
                                                  [-0.        , -0.31217121, -0.25541281],
                                                  [-0.        , -0.31217121, -0.25541281],
                                                  [-0.        , -0.31217121, -0.25541281],
                                                  [-0.        , -0.31217121, -0.25541281],
                                                  [ 0.        ,  0.31217121,  0.        ],
                                                  [-0.        , -0.31217121, -0.25541281],
                                                  [ 0.        ,  0.        ,  0.41622829],
                                                  [ 0.        ,  0.        ,  0.25541281],
                                                  [ 0.        ,  0.        ,  0.25541281],
                                                  [ 0.        ,  0.        ,  0.25541281],
                                                  [ 0.        ,  0.        ,  0.25541281]]))


if __name__ == '__main__':
    unittest.main()
\ No newline at end of file
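
A minimal usage sketch of the two methods above, assuming dm25.py is on the import path; the toy corpus and the max-normalised tf matrix are assumptions mirroring the unit test fixtures, not part of the commit:

import numpy as np
from dm25 import dm25

invert_index = {'i': [0, 1, 2], 'love': [0], 'shanghai': [0, 1]}
vocabulary = {'i': 0, 'love': 1, 'shanghai': 2}
docs_content = [['i', 'love', 'shanghai'], ['i', 'shanghai'], ['i']]
vocaCount, docuCount = 3, 3

# Term-frequency matrix (|V| x |D|), counts normalised by the per-document maximum.
tf_matrix_classique = np.array([[1., 1., 1.],
                                [1., 0., 0.],
                                [1., 1., 0.]])

idf = dm25.idf_2(invert_index, vocabulary, vocaCount, docuCount)
scores = dm25.dm_25(tf_matrix_classique, docs_content, docuCount, idf)
print(scores.shape)  # (3, 3): one BM25 score per (term, document) pair

The tests themselves can be run with, e.g., python3 -m unittest from the test directory (the test module's file name is not shown in this diff).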