# Copyright 2016 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Classes and helpers for summary data."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
class SummaryExample(object):
  """Features for one article/summary paired example.

  Attributes:
    ex_id: integer ID for example.
    article_sentences: list of string token lists for article sentences.
    article_edu_ids: list of integer IDs for the containing EDU (elementary
      discourse unit) for each token.
    article_parent_ids: list of integer IDs for the parent EDU for each token
      according to the discourse parse.
    extract_labels: list of binary integer lists, one per article sentence,
      indicating for each token whether it is in the oracle extraction.
    abstract_sentences: list of string token lists for abstract sentences.
  """

  def __init__(self, ex_id, article_sentences, article_edu_ids,
               article_parent_ids, extract_labels, abstract_sentences):
    self.ex_id = ex_id
    self.article_sentences = article_sentences
    self.extract_labels = extract_labels
    self.abstract_sentences = abstract_sentences
    self.article_edu_ids = article_edu_ids
    self.article_parent_ids = article_parent_ids
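

# A minimal sketch of constructing a SummaryExample; all values below are
# hypothetical and only illustrate the expected shapes (per-token EDU and
# parent IDs, per-sentence label lists):
#
#   example = SummaryExample(
#       ex_id=0,
#       article_sentences=[['the', 'cat', 'sat'], ['it', 'slept']],
#       article_edu_ids=[0, 0, 0, 1, 1],
#       article_parent_ids=[0, 0, 0, 0, 0],
#       extract_labels=[[1, 1, 1], [0, 0]],
#       abstract_sentences=[['a', 'cat', 'slept']],
#   )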


class SummaryBatch(object):
  """Model feeds for a batch of examples for extractive inference.

  Attributes:
    feeds: Feed dictionary mapping model placeholders to numpy tensors.
  """

  def __init__(self, hps, model_inputs, examples, vocab):
    """Constructor for SummaryBatch.

    Arguments:
      hps: bag of hyperparameters for the model.
      model_inputs: model placeholders for input tensors.
      examples: list of SummaryExample.
      vocab: Vocabulary object.
    """
    self.feeds = feeds = {}
    feeds[model_inputs.article] = article = np.zeros(
        [hps.batch_size, hps.num_art_steps], dtype=np.int64)
    feeds[model_inputs.article_len] = article_len = np.zeros(
        [hps.batch_size], dtype=np.int64)
    feeds[model_inputs.article_extract_label] = extract_label = np.zeros(
        [hps.batch_size, hps.num_art_steps], dtype=np.int64)
    feeds[model_inputs.abstract_bag] = abstract_bag = np.zeros(
        [hps.batch_size, hps.vocab_size], dtype=np.int64)
    feeds[model_inputs.abstract_len] = abstract_len = np.zeros(
        [hps.batch_size], dtype=np.int64)

    for i, nyex in enumerate(examples):
      # Flatten and lowercase the article tokens, truncating to the model's
      # maximum article length.
      article_sentence_tokens = [
          tok.lower() for sent in nyex.article_sentences for tok in sent
      ]
      article_sentence_tokens = article_sentence_tokens[:hps.num_art_steps]
      # Flatten the per-sentence extract labels to align with the tokens.
      extract_labels = [lab for sent in nyex.extract_labels for lab in sent]
      extract_labels = extract_labels[:hps.num_art_steps]
      abstract_sentence_tokens = [
          tok.lower() for sent in nyex.abstract_sentences for tok in sent
      ]

      j = 0
      for tok, lab in zip(article_sentence_tokens, extract_labels):
        article[i, j] = vocab.word_indices[tok]
        extract_label[i, j] = lab
        j += 1
      article_len[i] = j

      # Represent the abstract as a bag of words over the vocabulary.
      for tok in abstract_sentence_tokens:
        abstract_bag[i, vocab.word_indices[tok]] += 1
        abstract_len[i] += 1
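

# A minimal sketch of feeding a batch to a model (`session` and
# `model_outputs` are assumptions about the surrounding training code,
# not part of this module):
#
#   batch = SummaryBatch(hps, model_inputs, examples, vocab)
#   outputs = session.run(model_outputs, feed_dict=batch.feeds)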