Commit d76113ff authored by Quentin David's avatar Quentin David
Browse files

Add docs folder for documentation, description of exhaustive api calls for a...

Add docs folder for documentation, description of exhaustive api calls for a use case, adding models and API endpoints for new structure corpusProcess
parent a67814d3
// ***
// Corpus Process Controller file
// ***
const CorpusProcessModel = require('../models/CorpusProcessing.js');
const responseHelper = require('./lib/responseHelper');
const errorHandler = responseHelper.errorHandler;
const successHandler = responseHelper.successHandler;
const failHandler = responseHelper.failHandler; // For eg. "User not found" is a fail, not an error
// DEBUG: find all corpus processes
exports.debug_corpus_process_list = function(req, res) {
CorpusProcessModel.find({},(error, corpusProcesses) => {
if (error) { errorHandler(res, error); }
else {
successHandler(res, { corpusProcesses: corpusProcesses } );
}
})
}
// Add a new corpus process
exports.corpus_process_create = function(req,res) {
const newCorpusProcess = CorpusProcessModel({
// TO BE FILLED
});
newCorpusProcess.save((error, corpusProcess) => {
if (error) { errorHandler(res, error); }
else {
successHandler(res, { corpusProcess: corpusProcess}, 201);
}
})
}
// Get an existing corpus process
exports.corpus_process_get = function(req,res) {
const corpusProcessId = req.params.corpusProcessId;
CorpusProcessModel.findById(corpusProcessId, (error, corpusProcess) => {
if (error) { errorHandler(res, error); }
if (!corpusProcess) {
failHandler(res, { corpus: `No corpus process was found with id: ${corpusProcessId}`}, 404);
}
else {
successHandler(res, { corpusProcess: corpusProcess});
}
})
}
// Modify an existing corpus
exports.corpus_process_modify = function(req,res) {
const corpusProcessId = req.params.corpusProcessId;
CorpusProcessModel.findByIdAndUpdate(corpusProcessId, req.body, (error, corpusProcess) => {
if (error) { errorHandler(res, error); }
if (!corpusProcess) {
failHandler(res, { corpusProcess: `No corpus process was found with id: ${corpusProcessId}`}, 404);
}
else {
successHandler(res, { corpusProcess: corpusProcess});
}
})
}
// Delete an existing corpus
exports.corpus_delete = function(req,res) {
const corpusProcessId = req.params.corpusProcessId;
CorpusProcessModel.findByIdAndDelete(corpusProcessId, (error, corpusProcess) => {
if (error) { errorHandler(res, error); }
if (!corpusProcess) {
failHandler(res, { corpusProcess: `No corpus process was found with id: ${corpusProcessId}`}, 404);
}
else {
successHandler(res, { corpusProcess: null})
}
});
}
// Get the status of an existing corpus process
exports.corpus_process_status_get = function(req,res) {
const corpusProcessId = req.params.corpusProcessId;
CorpusProcessModel.findById(corpusProcessId, (error, corpusProcess) => {
if (error) { errorHandler(res, error); }
if (!corpusProcess) {
failHandler(res, { corpusProcess: `No corpus process was found with id: ${corpusProcesId}`}, 404)
}
else {
//TO BE FILLED
successHandler(res, { corpusProcess: { status: 'TBD'} });
}
})
}
// Get all corpus processes from a user
exports.corpus_process_user_get = function(req, res) {
const userId = req.params.userId;
CorpusProcessModel.find({ userId: userId}, (error, corpusProcesses) => {
if (error) { errorHandler(res, error); }
if (!corpusProcesses) {
failHandler(res, { corpus: `No corpus processes were found with id: ${corpusProcessId}`}, 404);
}
else {
successHandler(res, { corpusProcesses: corpusProcesses})
}
})
}
\ No newline at end of file
# Step by Step API calls for a complete user experience
## Description of the use case
The user will set three articles from "La voix du Nord" as their corpus.
The system will then apply POS tagging, neologism detection with Neoveille, and motif detection with SDMC to it.
## I: User sets their corpus.
POST /corpora
Request Body:
``` javascript
{
documents: [
{
corpusId: null, // Pas encore créé du coup....
source: '/article1.txt',
conlluBuffer: null // TO BE FILLED,maybe later?
},
{
corpusId: null, // Pas encore créé du coup....
source: '/article2.txt',
conlluBuffer: null // TO BE FILLED,maybe later?
},
{
corpusId: null, // Pas encore créé du coup....
source: '/article3.txt',
conlluBuffer: null // TO BE FILLED,maybe later?
}
],
createdBy: 'myUserId',
type: 'public',
creationDate: '12/10/2020 - 15h44',
metadata: {
author: 'La Voix du Nord',
title: 'Trois articles récents du journal « La voix du Nord »',
description: "Ces trois articles sont pris au hasard et ont été récupéré grâce à l'outil SOLR de Néoveille. Ils servent comme dummy corpus",
date: null,
type: 'Article de journal',
size: null,
language: 'fr',
userMetadata: []
}
}
```
The system may now validate all the information, then try to fill the conlluBuffer of each document by tokenizing the corpus. It will also need to compute the size of the corpus.
From now on, this document (as in a mongo document) will bear the id "corpusId".
## II: User sets their pipeline
POST /pipelines
Request body:
``` javascript
{
preTreatments: {
sentenceSegmentation: false,
wordSegmentation: true,
posTagger: true,
conversionToUTF: false
},
// Processes TO BE FILLED
processes: [
{
moduleName: 'Neoveille',
moduleParameters: [
{
name:,
value:
}
]
},
{
moduleName: 'SDMC',
moduleParameters: [
{
name:,
value:
}
]
}
],
creationDate: '12/10/2020 - 15h52',
description: "Cette chaîne de traitement permet d'identifier les néologismes ainsi que les motifs récurrents."
}
```
This pipeline, created as a document will now bear the id "pipelineId".
## III: System creates a corpus process and starts it
Now that the system has registered both the corpus and the pipeline, it can create the corpus process linking the two and start it.
POST /corpusProcesses
Request body:
``` javascript
{
corpusId: corpusId,
pipelineId: pipelineId,
userId: 'myUserId',
conllu: null, // Empty until filled by modules
annotatedDocuments: [
{
documentId: article1DocumentId,
corpusProcessId: null, // Same issue than in I: id not yet created
annotations: []
}
],
outputs: null,
currentProcessModule: null,
status: 'Not started yet'
}
```
### Pre-Treatments executions
Once it is added to the collection, it will fetch the preTreatments in the document of the corresponding pipeline.
``` javascript
preTreatments: {
sentenceSegmentation: false,
wordSegmentation: true,
posTagger: true,
conversionToUTF: false
}
```
wordSegmentation is supposed to be already done as we posted the corpus.
So the system will need to apply the POS Tagger to the corpus.
**TO BE FILLED**
```
I guess it needs to call /modules/treeTagger,
fetch /corpora/corpusId or /corpusProcess/corpusProcessId and get its conllu which would be the concatenated version.
Then apply the process, then returning a conllu column that we can add to corpusProcess.conllu.
```
### First module execution
Once it is added to the collection, it will fetch the first process in the list of the corresponding pipelineId and call the module, then update:
POST /modules/Neoveille
Request body:
``` javascript
{
// TO BE FILLED
moduleParameters: [
{
name:,
value:
}
]
}
```
### At the same time: update of corpusProcess
PUT /corpusProcess/corpusProcessId
Request body:
``` javascript
{
currentProcessingModule: 'Neoveille',
status: 'Started'
}
```
\ No newline at end of file
...@@ -7,109 +7,7 @@ ...@@ -7,109 +7,7 @@
const mongoose = require('mongoose'); const mongoose = require('mongoose');
const Schema = mongoose.Schema; const Schema = mongoose.Schema;
const ConlluColumnSchema = require('./CorpusProcessing')['ConlluColumnSchema']
/**
* @swagger
* components:
* schemas:
* Output:
* type: object
* description: data returned by a module at the end of a process
* required:
* - processId
* - moduleName
* - content.data
* properties:
* processId:
* type: String
* description: id of process that generated that output
* moduleName:
* type: String
* description: name of the module that generated that output
* content:
* type: object
* description: contains the information of the output
* properties:
* title:
* type: String
* description: a text given to describe the output
* example: Neologisms found by Neoveille
* description:
* type: String
* description: more text to detail the output
* example: 'This is the result of the module SDMC given those parameters: ....'
* data:
* type: String
* description: the content produced by the module
* example: macronisme\nadulescent\ncapilotracté
*/
// Output returned by a module
// Can be a text file or something else like a model.
/* Should there be documentId or corpusId? */
const OutputSchema = new Schema({
processId: { type: String, required: true},
moduleName: { type: String, required: true}, // Can be built from processId
content: {
title: { type: String },
description: { type: String },
data: { type: String, required: true}
}
});
/**
* @swagger
* components:
* schemas:
* Annotation:
* type: object
* description: contains the annotation produced for a given process for a single document
* required:
* - documentId
* - processId
* - moduleName
* - content.data
* properties:
* documentId:
* type: String
* description: id of the document whose annotation is attached
* processId:
* type: String
* description: id of the process whose annotation is attached
* moduleName:
* type: String
* description: name of the module that produced this annotation
* content:
* type: object
* description: contains the information of the annotation
* properties:
* title:
* type: String
* description: a text given to describe the annotation
* description:
* type: String
* description: more text to detail the annotation
* data:
* type: String
* description: the actual annotations produced by a process
* color:
* type: String
* description: the color that will be used to highlight annotated tokens
*/
// An annotation is the conjunction of a module and a document.
// It must contains all the informations needed for the visualisation tool to work.
const AnnotationSchema = new Schema({
documentId: { type: String, required: true}, // Needed ?
processId: { type: String, required: true},
moduleName: { type: String, required: true}, // Can be built from processId
content: {
title: { type: String },
description: { type: String },
data: { type: String, required: true}
},
color: { type: String }
})
/** /**
* @swagger * @swagger
...@@ -124,12 +22,10 @@ const AnnotationSchema = new Schema({ ...@@ -124,12 +22,10 @@ const AnnotationSchema = new Schema({
* description: the corpus from which the document comes * description: the corpus from which the document comes
* source: * source:
* type: String * type: String
* description: the original text of the document, unannotated * description: the original text of the document, unannotated or path of the original file
* annotations: * conlluBuffer:
* type: array * $ref: "#/components/schemas/ConlluColumn"
* description: all the annotations of a document * description: tokenization of the corpus, so only the form.
* items:
* $ref: "#/components/schemas/Annotation"
*/ */
// Documents are build at the end of the pipeline // Documents are build at the end of the pipeline
...@@ -137,37 +33,8 @@ const AnnotationSchema = new Schema({ ...@@ -137,37 +33,8 @@ const AnnotationSchema = new Schema({
// the annotations added by a pipeline // the annotations added by a pipeline
const DocumentSchema = new Schema({ const DocumentSchema = new Schema({
corpusId: { type: String, required: true}, // Needed ? corpusId: { type: String, required: true}, // Needed ?
source: { type: String }, source: { type: String }, // path of the original document?
annotations: { type: [AnnotationSchema] } conlluBuffer: { type: ConlluColumnSchema },
})
/**
* @swagger
* components:
* schemas:
* ConlluColumn:
* type: object
* description: the content of a column following ConLLu-Plus conventions
* required:
* - columnTitle
* - columnData
* properties:
* columnTitle:
* type: String
* description: the text corresponding to the head of the column
* example: UPOS
* columnData:
* type: String
* description: the data corresponding to the conllu annotation, where each row is the annotation of a token
* example: ADV\nVERB\nPRON\nNUM\nNOUN\nPUNCT
*
*/
// The content of a column with ConLLU-Plus style, each row corresponding to a token.
const ConlluColumnSchema = new Schema({
columnId: {type: String }, // Needed ?
columnTitle: { type: String, required: true},
columnData: { type: String, required: true}
}) })
/** /**
...@@ -202,6 +69,10 @@ const ConlluColumnSchema = new Schema({ ...@@ -202,6 +69,10 @@ const ConlluColumnSchema = new Schema({
* type: String * type: String
* description: weight of the corpus in bytes * description: weight of the corpus in bytes
* example: 20Mb * example: 20Mb
* language:
* type: String
* description: language of the corpus, can be multilingual
* example: fr
* userMetadata: * userMetadata:
* type: array * type: array
* description: list of metadata added by the user * description: list of metadata added by the user
...@@ -226,6 +97,7 @@ const MetadataSchema = new Schema({ ...@@ -226,6 +97,7 @@ const MetadataSchema = new Schema({
date: {type: Date }, date: {type: Date },
type: {type: String }, type: {type: String },
size: {type: String }, size: {type: String },
language: {type: String },
userMetadata: [{ userMetadata: [{
name: {type: String }, name: {type: String },
value: {type: String } value: {type: String }
...@@ -240,19 +112,6 @@ const MetadataSchema = new Schema({ ...@@ -240,19 +112,6 @@ const MetadataSchema = new Schema({
* type: object * type: object
* description: a text that will be processed through a pipeline * description: a text that will be processed through a pipeline
* properties: * properties:
* pipelineId:
* type: String
* description: id of the pipeline that processes that corpus
* conlluColumns:
* type: array
* description: list of columns, representing the whole conllu file
* items:
* $ref: "#/components/schemas/ConlluColumn"
* outputs:
* type: array
* description: list of outputs produced by the pipeline that processed that corpus
* items:
* $ref: "#/components/schemas/Output"
* documents: * documents:
* type: array * type: array
* description: list of documents that compose the corpus, used for the visualisation tool * description: list of documents that compose the corpus, used for the visualisation tool
...@@ -261,6 +120,12 @@ const MetadataSchema = new Schema({ ...@@ -261,6 +120,12 @@ const MetadataSchema = new Schema({
* createdBy: * createdBy:
* type: String * type: String
* description: '"userId" if created by a user or "public"' * description: '"userId" if created by a user or "public"'
* type:
* type: String
* description: whether the corpus is publicly available or private
* enum:
* - public
* - private
* creationDate: * creationDate:
* type: Date * type: Date
* description: the date of creation of the corpus to the database * description: the date of creation of the corpus to the database
...@@ -268,17 +133,18 @@ const MetadataSchema = new Schema({ ...@@ -268,17 +133,18 @@ const MetadataSchema = new Schema({
* $ref: "#/components/schemas/Metadata" * $ref: "#/components/schemas/Metadata"
*/ */
// A text that will be used for a pipeline // Representation of one or several documents
const CorpusSchema = new Schema({ const CorpusSchema = new Schema({
//corpusId: { type: String},
pipelineId: { type: String }, // What happens if there is two pipeline
conlluColumns: { type: [ConlluColumnSchema]},
outputs: { type: [OutputSchema]},
documents: { type: [DocumentSchema]}, documents: { type: [DocumentSchema]},
createdBy: { type: String}, // userId or public createdBy: { type: String}, // userId
type: {
type: String,
enum:['public', 'private']
},
creationDate : { type: Date }, creationDate : { type: Date },
metadata: { type: MetadataSchema } metadata: { type: MetadataSchema }
}); });
// Let's compile as a model // Let's compile as a model
const CorpusModel = mongoose.model('CorpusModel', CorpusSchema); const CorpusModel = mongoose.model('CorpusModel', CorpusSchema);
......
// ***
// CorpusProcessing Model file
// ***
const mongoose = require('mongoose');
const Schema = mongoose.Schema;
/**
* @swagger
* components:
* schemas:
* Output:
* type: object
* description: data returned by a module at the end of a process
* required:
* - processId
* - moduleName
* - content.data
* properties:
* processId:
* type: String
* description: id of process that generated that output
* moduleName:
* type: String
* description: name of the module that generated that output
* content:
* type: object
* description: contains the information of the output
* properties:
* title:
* type: String
* description: a text given to describe the output
* example: Neologisms found by Neoveille
* description:
* type: String
* description: more text to detail the output
* example: 'This is the result of the module SDMC given those parameters: ....'
* data:
* type: String
* description: the content produced by the module
* example: macronisme\nadulescent\ncapilotracté
*/
// Output returned by a module
// Can be a text file or something else like a model.
// Stores which process/module produced the data plus the payload itself;
// only `data` inside `content` is mandatory, title/description are optional.
/* Should there be documentId or corpusId? */
const OutputSchema = new Schema({
processId: { type: String, required: true}, // id of the process that generated this output
moduleName: { type: String, required: true}, // Can be built from processId
content: {
title: { type: String }, // optional short label for the output
description: { type: String }, // optional longer explanation
data: { type: String, required: true} // the actual content produced by the module
}
});
/**
* @swagger
* components:
* schemas:
* Annotation:
* type: object
* description: contains the annotation produced for a given process for a single document
* required:
* - documentId
* - processId
* - moduleName
* - content.data
* properties:
* documentId:
* type: String
* description: id of the document whose annotation is attached
* processId:
* type: String
* description: id of the process whose annotation is attached
* conlluColumnId:
* type: String
* description: id of the conllu column whose annotation is attached
* moduleName:
* type: String
* description: name of the module that produced this annotation
* content:
* type: object