-
Notifications
You must be signed in to change notification settings - Fork 26
/
getSentenceData.m
80 lines (64 loc) · 3.44 KB
/
getSentenceData.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
function sentenceData = getSentenceData(fn)
%GETSENTENCEDATA Flickr30k Entities sentence parser
%   sentenceData = getSentenceData(fn) reads the caption file fn, where
%   every line is one caption and annotated phrases are written inline as
%   [/EN#<id>/<type>/... word word ...], and returns one struct per line.
%
%   Input:
%     fn - path_to_file/<image id>.txt
%
%   Outputs:
%     sentenceData - array of structs (one for each sentence)
%                    with the following fields:
%         sentence           - the entire image caption with the
%                              annotation markup removed
%         phrases            - a cell array containing the tokenized
%                              annotated phrases
%         phraseFirstWordIdx - the sentence position of the first word
%                              of each annotated phrase
%         phraseID           - the annotation id of each phrase
%         phraseType         - any coarse categories associated with
%                              each phrase
%     For a caption without annotations, phrases, phraseFirstWordIdx,
%     phraseID and phraseType are empty.
%
%   Errors:
%     Raises an error when fn does not exist or cannot be opened.

    if ~exist(fn,'file')
        error('file not found');
    end

    % Read in the file and separate by sentence. exist() also succeeds for
    % folders and unreadable files, so the fopen result is checked too.
    [fileID,openMsg] = fopen(fn,'r');
    if fileID < 0
        error('could not open %s: %s',fn,openMsg);
    end
    % Guarantee the handle is released even when textscan throws.
    closeFile = onCleanup(@()fclose(fileID));
    C = textscan(fileID,'%s','Delimiter','\n');
    clear closeFile; % runs the cleanup, i.e. closes the file now

    % Get tokens of all annotated phrases, using brackets as grouping tokens
    annotatedPhrases = cellfun(@(f)regexp(f,'\[(.*?)\]','tokens'),C{1},'UniformOutput',false);

    % For each annotated phrase, the first word contains the annotation
    % information; strtok splits it from the words of the phrase
    [entityInfo,annotatedPhrases] = cellfun(@strtok,annotatedPhrases,'UniformOutput',false);

    % Tokenize the annotated phrases and get the annotation info
    annotatedPhrases = cellfun(@(f)cellfun(@(g)textscan(g{1},'%s'),f),annotatedPhrases,'UniformOutput',false);
    [entityID,entityType] = parseEntityInfo(entityInfo);

    % Remove annotation information from the sentences, leave only the
    % forward bracket to identify where annotation phrases begin
    sentences = cellfun(@(f)strrep(f,']',''),C{1},'UniformOutput',false);
    sentences = cellfun(@(f)regexprep(f,'\[\S* ','['),sentences,'UniformOutput',false);

    % Tokenize the sentences
    C = cellfun(@(f)textscan(f,'%s'),sentences,'UniformOutput',false);

    % Get the word number where each of the annotated phrases begins
    % (the words that still carry a forward bracket)
    startPosition = cellfun(@(f)find(cellfun(@(g)~isempty(strfind(g,'[')),f{1})),C,'UniformOutput',false);

    % Remove forward brackets and format the data for output
    sentences = cellfun(@(f)strrep(f,'[',''),sentences,'UniformOutput',false);
    sentenceData = struct('sentence',sentences,'phrases',annotatedPhrases,'phraseFirstWordIdx',startPosition,'phraseID',entityID,'phraseType',entityType);
end
function [entityID,entityType] = parseEntityInfo(entityInfo)
%PARSEENTITYINFO Split annotation tags into ids and entity types
%   entityInfo holds, per sentence, a cell array of one-element cells,
%   each wrapping a tag of the form /EN#<id>/<type>/... . The outputs are
%   column cell arrays with one entry per sentence:
%     entityID   - cell array with the id string of every tag
%     entityType - cell array with, for every tag, the cell array of
%                  fields that follow the id
%   Entries for sentences without annotations are left empty.

    sentenceCount = length(entityInfo);
    entityID = cell(sentenceCount,1);
    entityType = cell(sentenceCount,1);

    for s = 1:sentenceCount
        tags = entityInfo{s};
        if isempty(tags)
            % No annotations in this sentence: keep the empty placeholder
            continue;
        end

        ids = cell(size(tags));
        types = cell(size(tags));
        for t = 1:numel(tags)
            % Drop the leading /EN# (4 characters) and split on slashes
            fields = textscan(tags{t}{1}(5:end),'%s','Delimiter','/');
            fields = fields{1};
            % First field is the identifier, the rest are entity types
            ids{t} = fields{1};
            types{t} = fields(2:end);
        end

        entityID{s} = ids;
        entityType{s} = types;
    end
end