Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Julien David
lipn-search
Commits
6321deef
Commit
6321deef
authored
Apr 12, 2021
by
Hamid
Browse files
Ajout de la méthode search
parent
a9ed6757
Changes
2
Hide whitespace changes
Inline
Side-by-side
index_inverse/src/IndexInverse.py
0 → 100755
View file @
6321deef
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 13 16:11:24 2021
"""
class IndexInverse:
    """Inverted index over a small collection of text documents.

    Attributes:
        document_id: dict mapping a document number (its 0-based position
            in ``list_urls``) to the document URL.
        docs_content: list of token lists, one per document, in the same
            order as ``document_id``. Tokens are lower-cased, commas are
            removed, and the text is split on whitespace.
        invert_index: dict mapping each word to the list of document
            numbers in which it appears. Empty until ``create_index`` runs.
        vocabulary: dict mapping each word to an integer id assigned in
            order of first appearance, e.g. ``{'i': 0, 'am': 1}``. Empty
            until ``create_index`` runs.
    """

    # Class-level counters kept only for backward compatibility with code
    # that reads ``IndexInverse.docuCount`` / ``IndexInverse.vocaCount``.
    # They mirror the most recently built/indexed instance; the methods
    # below rely on per-instance state instead, so several indexes can
    # coexist without corrupting each other.
    docuCount = 0  # number of documents of the last constructed instance
    vocaCount = 0  # vocabulary size of the last indexed instance

    def __init__(self, list_urls, list_content):
        """Store the documents and tokenize their content.

        Args:
            list_urls: sequence of document URLs (used as document ids).
            list_content: sequence of document texts, aligned with
                ``list_urls``.

        Raises:
            ValueError: if the two sequences do not have the same length.
        """
        if len(list_urls) != len(list_content):
            # ValueError is a subclass of Exception, so callers catching
            # the former generic Exception keep working.
            raise ValueError("Invalid parameter!")

        self.document_id = dict(enumerate(list_urls))
        # str.split() with no argument splits on any whitespace run
        # (spaces, tabs, newlines) and never yields empty tokens, so no
        # separate newline stripping or empty-string cleanup is needed.
        self.docs_content = [
            content.lower().replace(",", "").split()
            for content in list_content
        ]
        self.invert_index = {}
        self.vocabulary = {}
        IndexInverse.docuCount = len(list_urls)

    def create_index(self):
        """Build ``invert_index`` and ``vocabulary`` from ``docs_content``.

        Every word found in the documents becomes a key of
        ``invert_index``; its value is the list of document numbers in
        which the word occurs, without duplicates. Calling this method
        again on an already indexed instance leaves the index unchanged.
        """
        for doc_id, article in enumerate(self.docs_content):
            for word in article:
                postings = self.invert_index.get(word)
                if postings is None:
                    # First time this word is seen: give it the next id.
                    self.vocabulary[word] = len(self.vocabulary)
                    self.invert_index[word] = [doc_id]
                elif doc_id not in postings:
                    postings.append(doc_id)
        IndexInverse.vocaCount = len(self.vocabulary)
        # Historical behavior: the vocabulary size is echoed on stdout.
        print(IndexInverse.vocaCount)

    def get_index(self):
        """Return the ``invert_index`` dictionary."""
        return self.invert_index

    def get_docs_with_keyword(self, word):
        """Return the list of document numbers in which ``word`` appears.

        If the word is not indexed, print ``'Error keyword'`` and return
        ``-1`` (sentinel kept for existing callers).
        """
        if word in self.invert_index:
            return self.invert_index[word]
        print('Error keyword')
        return -1

    def get_nb_documents(self):
        """Return the number of documents held by this index."""
        return len(self.document_id)

    def get_nb_vocabularys(self):
        """Return the number of words in this index's vocabulary."""
        return len(self.vocabulary)

    def get_vocabulary(self):
        """Return the dictionary mapping each word to its id."""
        return self.vocabulary

    def get_document_id(self):
        """Return the dictionary mapping document numbers to URLs."""
        return self.document_id

    def get_docs_content(self):
        """Return the list of tokenized document contents."""
        return self.docs_content

    def show_index(self):
        """Print the document count followed by every index entry."""
        print('NbDocument=' + str(len(self.document_id)))
        for word, doc_ids in self.invert_index.items():
            print('')
            print(word + ':', end=' ')
            for doc_id in doc_ids:
                print(doc_id, end=' ')
        return

    def search(self, request):
        """Return a placeholder result list for ``request``.

        The result is a one-element list holding a fake URL built from
        ``request`` followed by a fixed welcome text.
        """
        # The list is built directly: assigning to index 0 of an empty
        # list raises IndexError.
        return [
            "www." + request + ".com"
            + " bonjour bienvenu dans votre site web"
        ]
index_inverse/test/test_IndexInverse.py
0 → 100755
View file @
6321deef
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 13 18:41:56 2021
"""
import
sys
sys
.
path
.
append
(
'../src'
)
from
IndexInverse
import
IndexInverse
import
unittest
class TestIndexInverse(unittest.TestCase):
    """Unit tests for IndexInverse on a fixed three-document corpus."""

    _URLS = ['lipn.fr_1', 'lipn.fr_2', 'lipn.fr_3']
    _TEXTS = [
        'I love shanghai',
        'i am from shanghai now i study in tongji university',
        'i am from lanzhou now i study in lanzhou university of science and technolgy',
    ]

    def setUp(self):
        # Every test works on its own freshly built and indexed instance.
        self.index = IndexInverse(list(self._URLS), list(self._TEXTS))
        self.index.create_index()

    def test_IndexInverse_1(self):
        # A word present in every document lists all document numbers.
        self.assertEqual(self.index.get_docs_with_keyword('i'), [0, 1, 2])

    def test_IndexInverse_2(self):
        # An unknown word yields the -1 sentinel.
        self.assertEqual(self.index.get_docs_with_keyword('like'), -1)

    def test_IndexInverse_3(self):
        self.assertEqual(self.index.get_nb_documents(), 3)

    def test_IndexInverse_4(self):
        self.assertEqual(self.index.get_nb_vocabularys(), 15)

    def test_IndexInverse_5(self):
        # Word ids follow the order of first appearance in the corpus.
        expected_vocabulary = {
            'i': 0, 'love': 1, 'shanghai': 2, 'am': 3, 'from': 4,
            'now': 5, 'study': 6, 'in': 7, 'tongji': 8, 'university': 9,
            'lanzhou': 10, 'of': 11, 'science': 12, 'and': 13,
            'technolgy': 14,
        }
        self.assertEqual(self.index.get_vocabulary(), expected_vocabulary)

    def test_IndexInverse_6(self):
        expected_ids = {0: 'lipn.fr_1', 1: 'lipn.fr_2', 2: 'lipn.fr_3'}
        self.assertEqual(self.index.get_document_id(), expected_ids)

    def test_IndexInverse_7(self):
        # Content is lower-cased and split into words, one list per document.
        expected_content = [
            ['i', 'love', 'shanghai'],
            ['i', 'am', 'from', 'shanghai', 'now', 'i', 'study', 'in',
             'tongji', 'university'],
            ['i', 'am', 'from', 'lanzhou', 'now', 'i', 'study', 'in',
             'lanzhou', 'university', 'of', 'science', 'and', 'technolgy'],
        ]
        self.assertEqual(self.index.get_docs_content(), expected_content)
# Run the test suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment