Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/paragraph-extraction #7623

Open
wants to merge 21 commits into
base: production
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
2795296
Add zod library to dependencies and update yarn.lock
Joao-vi Jan 28, 2025
fbc0911
Implement AbstractController and UseCase interface for API structure
Joao-vi Jan 28, 2025
9fbd629
Add method to filter properties by type in Template class
Joao-vi Jan 28, 2025
94cf5a0
Add ExtractorDataSource interface for extractor creation
Joao-vi Jan 28, 2025
ae3b4b8
Add Extractor class with validation and status management
Joao-vi Jan 28, 2025
b4e68ac
Add CreateExtractorUseCase for creating extractors with template vali…
Joao-vi Jan 28, 2025
5ef3536
Add extractor ID generation and update Extractor class to include ID …
Joao-vi Jan 28, 2025
a3d5753
Add MongoDB data source and error handling for extractors
Joao-vi Jan 29, 2025
45db0cf
Fix import path for CreateExtractorUseCase in specs
Joao-vi Jan 29, 2025
2e3449d
Add getAll method to TemplatesDataSource and MongoTemplatesDataSource…
Joao-vi Jan 29, 2025
a492fd8
fix eslint rule and change naming conventions
daneryl Jan 30, 2025
1e2c7ab
IdGenerator as dependency instead of DS.nextId
daneryl Jan 30, 2025
d06c277
getAllFrom on testingEnvironment to get data from db
daneryl Jan 30, 2025
cb08752
unified PX validation errors in a single class
daneryl Jan 30, 2025
1f4feb1
wip extract paragraphs
Joao-vi Feb 6, 2025
4537ac5
Merge branch 'production' of github.com:huridocs/uwazi into feat/para…
Joao-vi Feb 6, 2025
526a037
fix: update Document instantiation in S3FileStorage.spec.ts to includ…
Joao-vi Feb 6, 2025
ed99d0d
fix: update LanguageISO6391 type usage in FilesMappers and commonSchemas
Joao-vi Feb 6, 2025
317a121
fix: enforce required ISO639_1 field in language schemas and update u…
Joao-vi Feb 6, 2025
3b9c331
Revert "fix: enforce required ISO639_1 field in language schemas and …
Joao-vi Feb 6, 2025
6919c30
feat: introduce FileType and HttpClientFactory, enhance Segmentation …
Joao-vi Feb 10, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .eslintrc.js
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,11 @@ module.exports = {
excludedFiles: './**/*.cy.tsx',
parser: '@typescript-eslint/parser',
parserOptions: { project: './tsconfig.json' },
rules: { ...rules },
rules: {
...rules,
'no-empty-function': ['error', { allow: ['constructors'] }],
'no-useless-constructor': 'off',
},
},
{
files: ['./cypress/**/*.ts', './cypress/**/*.d.ts', './**/*.cy.tsx'],
Expand Down
38 changes: 38 additions & 0 deletions app/api/common.v2/AbstractController.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import { NextFunction, Request, Response } from 'express';

/**
 * Constructor input for controllers: the Express `next` callback and the
 * Express `res` object of the request being handled.
 */
export type Dependencies = {
next: NextFunction;
res: Response;
};

/**
 * Base class for controllers. It stores the Express `res` and `next` received
 * at construction time and provides helpers for the usual JSON replies.
 */
export abstract class AbstractController {
  protected next: NextFunction;

  protected res: Response;

  constructor(dependencies: Dependencies) {
    const { next, res } = dependencies;
    this.next = next;
    this.res = res;
  }

  /** Handles the incoming request; every concrete controller implements it. */
  abstract handle(request: Request): Promise<void>;

  /**
   * Replies with status 500 and a `{ message }` body taken from the error.
   * @returns whatever `res.status(500).json(...)` returns.
   */
  serverError(error: Error) {
    // Open question from the author: should this be logged?
    const { message } = error;

    return this.res.status(500).json({ message });
  }

  /**
   * Replies with status 400 and a `{ message }` body.
   * @returns whatever `res.status(400).json(...)` returns.
   */
  clientError(message: string) {
    // Open questions from the author: should this be logged, and would that
    // spam the notifications channel?
    const payload = { message };

    return this.res.status(400).json(payload);
  }

  /** Replies with status 200 and `body` as JSON, then calls `next()`. */
  jsonResponse(body: any) {
    const okResponse = this.res.status(200);
    okResponse.json(body);
    this.next();
  }
}
41 changes: 41 additions & 0 deletions app/api/common.v2/contracts/HttpClient.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import { Readable } from 'stream';

/**
 * Argument of `HttpClient.postFormData`: the target `url` and the form
 * fields, keyed by field name. Field values are untyped (`any`).
 */
type PostFormDataInput = {
url: string;
formData: Record<string, any>;
};

/**
 * Contract for an HTTP client able to post form data.
 * `T` is the caller-chosen type of the resolved value; nothing in this
 * contract validates the payload against it.
 */
interface HttpClient {
postFormData<T>(input: PostFormDataInput): Promise<T>;
}

/** Stream type an HttpFile reads its content from. */
type Source = Readable;

/** Constructor input of HttpFile. */
type HttpFileProps = {
  filename: string;
  source: Source;
};

/**
 * A named file whose content comes from a readable stream, meant to be sent
 * as part of a form-data request.
 */
class HttpFile {
  filename: string;

  source: Source;

  constructor(props: HttpFileProps) {
    this.filename = props.filename;
    this.source = props.source;
  }

  /**
   * Drains `source` and resolves with its whole content as a single Buffer.
   *
   * String chunks (emitted by streams with an encoding set, or by
   * `Readable.from` over strings) are converted with `Buffer.from`, which
   * encodes as UTF-8; Buffer chunks are used untouched. Without this
   * normalisation `Buffer.concat` throws inside the `end` handler and the
   * promise never settles.
   *
   * Listeners are attached when this method is called.
   * NOTE(review): a source that was already consumed will presumably never
   * emit `end` again, so call this once per stream - confirm with callers.
   *
   * @returns the concatenated content of the stream.
   * @throws rejects with the error emitted by the stream.
   */
  async toBuffer(): Promise<Buffer> {
    return new Promise<Buffer>((resolve, reject) => {
      const chunks: Buffer[] = [];
      this.source.on('data', (chunk: Buffer | string) => {
        chunks.push(typeof chunk === 'string' ? Buffer.from(chunk) : chunk);
      });
      this.source.once('end', () => resolve(Buffer.concat(chunks)));
      this.source.once('error', (err: unknown) => reject(err));
    });
  }
}

export type { PostFormDataInput as PostInput, HttpClient };

export { HttpFile };
3 changes: 3 additions & 0 deletions app/api/common.v2/contracts/UseCase.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/**
 * Shape shared by application use cases: a single asynchronous `execute`
 * that takes an `Input` and resolves with an `Output`.
 */
export interface UseCase<Input, Output> {
execute(input: Input): Promise<Output>;
}
10 changes: 6 additions & 4 deletions app/api/common.v2/database/MongoDataSource.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,12 @@ export abstract class MongoDataSource<TSchema extends Document = Document> {
this.transactionManager = transactionManager;
}

protected getCollection(collectionName = this.collectionName) {
return new SyncedCollection<TSchema>(
new SessionScopedCollection<TSchema>(
this.db.collection<TSchema>(collectionName),
protected getCollection<Collection extends Document = TSchema>(
collectionName = this.collectionName
) {
return new SyncedCollection<Collection>(
new SessionScopedCollection<Collection>(
this.db.collection<Collection>(collectionName),
this.transactionManager
),
this.transactionManager,
Expand Down
7 changes: 7 additions & 0 deletions app/api/common.v2/infrastructure/HttpClientFactory.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import { SuperAgentHttpClient } from './SuperAgentHttpClient';

/** Single place to obtain the default HTTP client implementation. */
export class HttpClientFactory {
  /** Creates a new `SuperAgentHttpClient` on every call. */
  static createDefault() {
    const defaultClient = new SuperAgentHttpClient();
    return defaultClient;
  }
}
59 changes: 59 additions & 0 deletions app/api/common.v2/infrastructure/SuperAgentHttpClient.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import superagent from 'superagent';
import { HttpClient, HttpFile, PostInput } from '../contracts/HttpClient';

/** One multipart part, ready to be written to the request. */
type FormPart =
  | { kind: 'file'; key: string; buffer: Buffer; filename: string }
  | { kind: 'field'; key: string; value: any };

/**
 * HttpClient implementation backed by superagent.
 */
export class SuperAgentHttpClient implements HttpClient {
  private client = superagent;

  /**
   * Posts `formData` to `url` as form fields / attachments and resolves with
   * the response body, typed (not validated) as `T`.
   *
   * Value handling, per entry of `formData`:
   * - `HttpFile`: buffered and attached under the entry key with its filename;
   * - array: every item is handled by these same rules under the same key;
   * - other non-null object: sent as a field holding its `JSON.stringify`;
   * - anything else: sent as a field as is.
   *
   * Parts are written in the order the entries (and array items) were
   * declared, regardless of how long each file takes to buffer.
   */
  async postFormData<T>({ url, formData }: PostInput): Promise<T> {
    const request = this.client.post(url);

    await this.appendFormDataToRequest(request, formData);

    const response = await request;

    return response.body as T;
  }

  /**
   * Resolves every entry to its parts concurrently (so all file streams start
   * being read right away) and only then writes them to the request, in
   * declaration order. Writing each part as soon as its file finished
   * buffering would make the part order depend on stream timing.
   */
  private async appendFormDataToRequest(
    request: superagent.Request,
    formData: Record<string, any>
  ) {
    const partsPerEntry = await Promise.all(
      Object.entries(formData).map(async ([key, value]) => this.toFormParts(key, value))
    );

    partsPerEntry.forEach(parts => parts.forEach(part => this.appendPart(request, part)));
  }

  /** Turns one form value into the ordered list of parts it produces. */
  private async toFormParts(key: string, value: any): Promise<FormPart[]> {
    if (value instanceof HttpFile) {
      return [{ kind: 'file', key, buffer: await value.toBuffer(), filename: value.filename }];
    }

    if (Array.isArray(value)) {
      const nested = await Promise.all(value.map(async item => this.toFormParts(key, item)));
      return nested.reduce<FormPart[]>((all, parts) => all.concat(parts), []);
    }

    if (typeof value === 'object' && value !== null) {
      return [{ kind: 'field', key, value: JSON.stringify(value) }];
    }

    return [{ kind: 'field', key, value }];
  }

  /** Writes a single part to the request. */
  private appendPart(request: superagent.Request, part: FormPart) {
    if (part.kind === 'file') {
      // The request object is thenable; it is deliberately not awaited here.
      // eslint-disable-next-line @typescript-eslint/no-floating-promises
      request.attach(part.key, part.buffer, part.filename);
      return;
    }

    // eslint-disable-next-line @typescript-eslint/no-floating-promises
    request.field(part.key, part.value);
  }
}
7 changes: 6 additions & 1 deletion app/api/files.v2/contracts/FilesDataSource.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import { ResultSet } from 'api/common.v2/contracts/ResultSet';
import { UwaziFile } from '../model/UwaziFile';
import { Segmentation } from '../model/Segmentation';
import { Document } from '../model/Document';

export interface FilesDataSource {
/**
 * Read contract for stored files, their segmentations and the documents
 * attached to an entity.
 */
interface FilesDataSource {
// Resolves to a boolean for the given list of { entity, _id } pairs.
filesExistForEntities(files: { entity: string; _id: string }[]): Promise<boolean>;
// Result set over UwaziFile models.
getAll(): ResultSet<UwaziFile>;
// Result set of Segmentation models for the given file ids.
getSegmentations(fileId: string[]): ResultSet<Segmentation>;
// Result set of Document models for the given entity shared id.
getDocumentsForEntity(entitySharedId: string): ResultSet<Document>;
}
export type { FilesDataSource };
19 changes: 7 additions & 12 deletions app/api/files.v2/database/FilesMappers.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// import { OptionalId } from 'mongodb';

import { LanguageUtils } from 'shared/language';
import { FileDBOType } from './schemas/filesTypes';
import { UwaziFile } from '../model/UwaziFile';
import { Document } from '../model/Document';
Expand All @@ -8,15 +7,6 @@ import { Attachment } from '../model/Attachment';
import { CustomUpload } from '../model/CustomUpload';

export const FileMappers = {
// toDBO(file: UwaziFile): OptionalId<FileDBOType> {
// return {
// filename: file.filename,
// entity: file.entity,
// type: 'document',
// totalPages: file.totalPages,
// };
// },

toModel(fileDBO: FileDBOType): UwaziFile {
if (fileDBO.type === 'attachment' && fileDBO.url) {
return new URLAttachment(
Expand All @@ -43,11 +33,16 @@ export const FileMappers = {
fileDBO.filename
).withCreationDate(new Date(fileDBO.creationDate));
}
return this.toDocumentModel(fileDBO);
},

toDocumentModel(fileDBO: FileDBOType) {
return new Document(
fileDBO._id.toString(),
fileDBO.entity,
fileDBO.totalPages,
fileDBO.filename
fileDBO.filename,
LanguageUtils.fromISO639_3(fileDBO.language).ISO639_1!
).withCreationDate(new Date(fileDBO.creationDate));
},
};
34 changes: 32 additions & 2 deletions app/api/files.v2/database/MongoFilesDataSource.ts
Original file line number Diff line number Diff line change
@@ -1,21 +1,51 @@
import { MongoDataSource } from 'api/common.v2/database/MongoDataSource';
import { MongoResultSet } from 'api/common.v2/database/MongoResultSet';
import { SegmentationType } from 'shared/types/segmentationType';
import { ObjectId } from 'mongodb';
import { ResultSet } from 'api/common.v2/contracts/ResultSet';
import { FilesDataSource } from '../contracts/FilesDataSource';
import { FileMappers } from './FilesMappers';
import { FileDBOType } from './schemas/filesTypes';
import { UwaziFile } from '../model/UwaziFile';
import { Segmentation } from '../model/Segmentation';
import { Document } from '../model/Document';
import { SegmentationMapper } from './SegmentationMapper';

/**
 * Database shape of a segmentation: the shared SegmentationType with `_id`
 * and `fileID` typed as ObjectId.
 */
export type SegmentationDBO = SegmentationType & {
_id: ObjectId;
fileID: ObjectId;
};

export class MongoFilesDataSource extends MongoDataSource<FileDBOType> implements FilesDataSource {
protected collectionName = 'files';

/**
 * Looks up, in the `segmentations` collection, the entries that belong to the
 * given files, have status `ready` and carry a `segmentation` field, mapped
 * to the domain model.
 */
getSegmentations(filesId: string[]): ResultSet<Segmentation> {
  const segmentations = this.getCollection<SegmentationDBO>('segmentations');
  const fileObjectIds = filesId.map(fileId => new ObjectId(fileId));

  return new MongoResultSet(
    segmentations.find({
      fileID: { $in: fileObjectIds },
      status: 'ready',
      segmentation: { $exists: true },
    }),
    SegmentationMapper.toDomain
  );
}

/**
 * Returns the files of type `document` whose `entity` equals the given shared
 * id, mapped to Document models. `fullText` is excluded from the projection.
 */
getDocumentsForEntity(entitySharedId: string): ResultSet<Document> {
  const documentsCursor = this.getCollection().find(
    { entity: entitySharedId, type: 'document' },
    { projection: { fullText: 0 } }
  );

  return new MongoResultSet<FileDBOType, Document>(
    documentsCursor,
    FileMappers.toDocumentModel
  );
}

/**
 * Returns every file in the collection mapped to its domain model.
 * `fullText` is excluded from the projection.
 */
getAll() {
  return new MongoResultSet<FileDBOType, UwaziFile>(
    this.getCollection().find({}, { projection: { fullText: 0 } }),
    // `FileMappers.toModel` calls `this.toDocumentModel`, so it has to be
    // invoked as a method of FileMappers. Passing the bare function reference
    // would lose that receiver when the result set calls the mapper.
    fileDBO => FileMappers.toModel(fileDBO)
  );
}

protected collectionName = 'files';

async filesExistForEntities(files: { entity: string; _id: string }[]) {
const query = {
$or: files.map(file => ({ _id: new ObjectId(file._id), entity: file.entity })),
Expand Down
25 changes: 25 additions & 0 deletions app/api/files.v2/database/SegmentationMapper.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import { Segmentation } from '../model/Segmentation';
import { SegmentationDBO } from './MongoFilesDataSource';

/** Converts segmentation database objects into the Segmentation domain shape. */
export class SegmentationMapper {
  /**
   * Maps a SegmentationDBO to a Segmentation.
   *
   * Ids are stringified, snake_case segmentation fields are exposed in
   * camelCase, and `autoexpire` becomes `autoExpire`. Page size and paragraphs
   * are `undefined` when the DBO carries no `segmentation`.
   */
  static toDomain(dbo: SegmentationDBO): Segmentation {
    const id = dbo._id.toString();
    const fileId = dbo.fileID.toString();
    const { segmentation } = dbo;

    const paragraphs = segmentation?.paragraphs?.map(paragraph => ({
      height: paragraph.height!,
      left: paragraph.left!,
      pageNumber: paragraph.page_number!,
      text: paragraph.text!,
      top: paragraph.top!,
      width: paragraph.width!,
    }));

    return {
      id,
      fileId,
      status: dbo.status!,
      filename: dbo.filename!,
      pageHeight: segmentation?.page_height,
      pageWidth: segmentation?.page_width,
      paragraphs,
      autoExpire: dbo.autoexpire!,
      xmlname: dbo.xmlname,
    };
  }
}
1 change: 1 addition & 0 deletions app/api/files.v2/database/schemas/filesTypes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ interface BaseFileDBOType {
interface DocumentFileDBOType extends BaseFileDBOType {
type: 'document' | 'attachment' | 'custom';
totalPages: number;
language: string;
}

export type FileDBOType = DocumentFileDBOType;
48 changes: 48 additions & 0 deletions app/api/files.v2/database/specs/MongoSegmentationBuilder.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import { ObjectId } from 'mongodb';
import { ParagraphSchema, SegmentationType } from 'shared/types/segmentationType';

type Props = SegmentationType;

/**
 * Fluent builder of segmentation database fixtures for specs.
 * Start with `create()`, chain the `with*` methods, finish with `build()`.
 */
export class MongoSegmentationBuilder {
  private constructor(private props: Props) {}

  /**
   * Starts a builder with a fresh `_id`, status `ready` and an empty
   * segmentation (0x0 page, no paragraphs).
   */
  static create() {
    return new MongoSegmentationBuilder({
      _id: new ObjectId(),
      status: 'ready',
      segmentation: {
        page_height: 0,
        page_width: 0,
        paragraphs: [],
      },
    });
  }

  /** Sets `_id`. */
  withId(id: ObjectId) {
    this.props._id = id;

    return this;
  }

  /**
   * Appends a paragraph to the segmentation. Does nothing when the props have
   * no `segmentation` or no `paragraphs` array.
   */
  withParagraph(paragraph: ParagraphSchema) {
    this.props.segmentation?.paragraphs?.push(paragraph);

    return this;
  }

  /** Sets `status`. */
  withStatus(status: Props['status']) {
    this.props.status = status;

    return this;
  }

  /** Sets `fileID`. */
  withFileId(id: ObjectId) {
    this.props.fileID = id;

    return this;
  }

  /**
   * Returns a snapshot of the current props.
   *
   * The nested `segmentation` object, its `paragraphs` array and each
   * paragraph are copied, so later `withParagraph` calls on this builder do
   * not alter objects that were already built.
   */
  build(): SegmentationType {
    const { segmentation } = this.props;
    if (!segmentation) {
      return { ...this.props };
    }

    const { paragraphs } = segmentation;

    return {
      ...this.props,
      segmentation: {
        ...segmentation,
        // Only written when present, to avoid adding an explicit `undefined` key.
        ...(paragraphs && { paragraphs: paragraphs.map(paragraph => ({ ...paragraph })) }),
      },
    };
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ describe('S3FileStorage', () => {
describe('getPath', () => {
it.each([
{
file: new Document('id', 'entity', 1, 'document'),
file: new Document('id', 'entity', 1, 'document', 'ab'),
expected: 'test-tenant/documents/document',
},
{
Expand Down
Loading
Loading