From e09a66cd7c05c7f60d21ea12a12142cc759e31b9 Mon Sep 17 00:00:00 2001 From: Gregor Leban Date: Fri, 24 Aug 2018 13:46:24 +0200 Subject: [PATCH] ## [v8.4]() (2018-08-24) **Added** - added `EventRegistry.getUsageInfo()` method, which returns the number of used tokens and the total number of available tokens for the given user. The existing methods `EventRegistry.getRemainingAvailableRequests()` and `EventRegistry.getDailyAvailableRequests()` are still there, but their values are only valid after making at least one request. - added searching of articles and events based on article authors. You can now provide `authorUri` parameter when creating the `QueryArticles` and `QueryEvents` instances. - added author related methods to `EventRegistry` class: `EventRegistry.suggestAuthors()` to obtain uris of authors for given (partial) name and `EventRegistry.getAuthorUri()` to obtain a single author uri for the given (partial) name. - added ability to search articles and events by authors. `QueryArticles` and `QueryEvents` constructors now also accept `authorUri` parameter that can be used to limit the results to articles/events by those authors. Use `QueryItems.AND()` or `QueryItems.OR()` to specify multiple authors in the same query. - BETA: added a filter for returning only articles that are written by sources that have a certain ranking. The filter can be specified by setting the parameters `startSourceRankPercentile` and `endSourceRankPercentile` when creating the `QueryArticles` instance. The default value for `startSourceRankPercentile` is 0 and for `endSourceRankPercentile` is 100. The values cannot be arbitrary numbers between 0 and 100; each has to be a number divisible by 10. By setting `startSourceRankPercentile` to 0 and `endSourceRankPercentile` to 20 you would get only articles from top ranked news sources (according to [Alexa site ranking](https://www.alexa.com/siteinfo)) that would amount to *approximately 20%* of all matching content. 
Note: 20 percentiles do not represent 20% of all top sources. The value is used to identify the subset of news sources that generate approximately 20% of our collected news content. The reason for this choice is that the top ranked 10% of news sources write about 30% of all news content and our choice normalizes this effect. This feature could potentially change in the future. - `QueryEventArticlesIter` is now able to return only a subset of articles assigned to an event. You can use the same filters as with the `QueryArticles` constructor and you can specify them when constructing the instance of `QueryEventArticlesIter`. The same kind of filtering is also possible if you want to use the `RequestEventArticles()` class instead. - added some parameters and changed default values in some of the result types to reflect the backend changes. - added optional parameter `proxyUrl` to `Analytics.extractArticleInfo()`. It can be used to download article info through a proxy that you provide (to avoid potential GDPR issues). The `proxyUrl` should be in format `{schema}://{username}:{pass}@{proxy url/ip}`. --- CHANGELOG.md | 12 + eventregistry/Analytics.py | 9 +- eventregistry/Base.py | 46 ++-- eventregistry/EventRegistry.py | 56 ++++- eventregistry/Query.py | 18 +- eventregistry/QueryArticles.py | 70 +++++- eventregistry/QueryEvent.py | 266 ++++++++++++++++++++--- eventregistry/QueryEvents.py | 34 ++- eventregistry/ReturnInfo.py | 3 + eventregistry/tests/TestQueryArticles.py | 22 ++ eventregistry/tests/TestQueryEvent.py | 34 ++- eventregistry/tests/TestQueryEvents.py | 21 +- 12 files changed, 486 insertions(+), 105 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7466d4d..74fa492 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,17 @@ # Change Log +## [v8.4]() (2018-08-24) + +**Added** +- added `EventRegistry.getUsageInfo()` method, which returns the number of used tokens and the total number of available tokens for the given user. 
The existing methods `EventRegistry.getRemainingAvailableRequests()` and `EventRegistry.getDailyAvailableRequests()` are still there, but their values are only valid after making at least one request. +- added searching of articles and events based on article authors. You can now provide `authorUri` parameter when creating the `QueryArticles` and `QueryEvents` instances. +- added author related methods to `EventRegistry` class: `EventRegistry.suggestAuthors()` to obtain uris of authors for given (partial) name and `EventRegistry.getAuthorUri()` to obtain a single author uri for the given (partial) name. +- added ability to search articles and events by authors. `QueryArticles` and `QueryEvents` constructors now also accept `authorUri` parameter that can be used to limit the results to articles/events by those authors. Use `QueryItems.AND()` or `QueryItems.OR()` to specify multiple authors in the same query. +- BETA: added a filter for returning only articles that are written by sources that have a certain ranking. The filter can be specified by setting the parameters `startSourceRankPercentile` and `endSourceRankPercentile` when creating the `QueryArticles` instance. The default value for `startSourceRankPercentile` is 0 and for `endSourceRankPercentile` is 100. The values cannot be arbitrary numbers between 0 and 100; each has to be a number divisible by 10. By setting `startSourceRankPercentile` to 0 and `endSourceRankPercentile` to 20 you would get only articles from top ranked news sources (according to [Alexa site ranking](https://www.alexa.com/siteinfo)) that would amount to *approximately 20%* of all matching content. Note: 20 percentiles do not represent 20% of all top sources. The value is used to identify the subset of news sources that generate approximately 20% of our collected news content. The reason for this choice is that the top ranked 10% of news sources write about 30% of all news content and our choice normalizes this effect. 
This feature could potentially change in the future. +- `QueryEventArticlesIter` is now able to return only a subset of articles assigned to an event. You can use the same filters as with the `QueryArticles` constructor and you can specify them when constructing the instance of `QueryEventArticlesIter`. The same kind of filtering is also possible if you want to use the `RequestEventArticles()` class instead. +- added some parameters and changed default values in some of the result types to reflect the backend changes. +- added optional parameter `proxyUrl` to `Analytics.extractArticleInfo()`. It can be used to download article info through a proxy that you provide (to avoid potential GDPR issues). The `proxyUrl` should be in format `{schema}://{username}:{pass}@{proxy url/ip}`. + ## [v8.3.1]() (2018-08-12) **Updated** diff --git a/eventregistry/Analytics.py b/eventregistry/Analytics.py index c39a5d1..1e7e06c 100644 --- a/eventregistry/Analytics.py +++ b/eventregistry/Analytics.py @@ -75,13 +75,18 @@ def detectLanguage(self, text): return self._er.jsonRequestAnalytics("/api/v1/detectLanguage", { "text": text }) - def extractArticleInfo(self, url): + def extractArticleInfo(self, url, proxyUrl = None): """ extract all available information about an article available at url `url`. Returned information will include article title, body, authors, links in the articles, ... + @param url: article url to extract article information from + @param proxyUrl: proxy that should be used for downloading article information. 
format: {schema}://{username}:{pass}@{proxy url/ip} @returns: dict """ - return self._er.jsonRequestAnalytics("/api/v1/extractArticleInfo", { "url": url }) + params = { "url": url } + if proxyUrl: + params["proxyUrl"] = proxyUrl + return self._er.jsonRequestAnalytics("/api/v1/extractArticleInfo", params) def ner(self, text): diff --git a/eventregistry/Base.py b/eventregistry/Base.py index 8b6d383..fde19c1 100644 --- a/eventregistry/Base.py +++ b/eventregistry/Base.py @@ -196,27 +196,6 @@ def _getQueryParams(self): return dict(self.queryParams) - -class Query(QueryParamsBase): - def __init__(self): - QueryParamsBase.__init__(self) - self.resultTypeList = [] - - - def _getQueryParams(self): - """encode the request.""" - allParams = {} - if len(self.resultTypeList) == 0: - raise ValueError("The query does not have any result type specified. No sense in performing such a query") - allParams.update(self.queryParams) - for request in self.resultTypeList: - allParams.update(request.__dict__) - # all requests in resultTypeList have "resultType" so each call to .update() overrides the previous one - # since we want to store them all we have to add them here: - allParams["resultType"] = [request.__dict__["resultType"] for request in self.resultTypeList] - return allParams - - def _setQueryArrVal(self, value, propName, propOperName, defaultOperName): """ parse the value "value" and use it to set the property propName and the operator with name propOperName @@ -251,4 +230,27 @@ def _setQueryArrVal(self, value, propName, propOperName, defaultOperName): # there should be no other valid types else: - assert False, "Parameter '%s' was of unsupported type. It should either be None, a string or an instance of QueryItems" % (propName) \ No newline at end of file + assert False, "Parameter '%s' was of unsupported type. 
It should either be None, a string or an instance of QueryItems" % (propName) + + + +class Query(QueryParamsBase): + def __init__(self): + QueryParamsBase.__init__(self) + self.resultTypeList = [] + + + def _getQueryParams(self): + """encode the request.""" + allParams = {} + if len(self.resultTypeList) == 0: + raise ValueError("The query does not have any result type specified. No sense in performing such a query") + allParams.update(self.queryParams) + for request in self.resultTypeList: + allParams.update(request.__dict__) + # all requests in resultTypeList have "resultType" so each call to .update() overrides the previous one + # since we want to store them all we have to add them here: + allParams["resultType"] = [request.__dict__["resultType"] for request in self.resultTypeList] + return allParams + + diff --git a/eventregistry/EventRegistry.py b/eventregistry/EventRegistry.py index d781ed5..dc042fc 100644 --- a/eventregistry/EventRegistry.py +++ b/eventregistry/EventRegistry.py @@ -141,15 +141,20 @@ def printConsole(self, text): def getRemainingAvailableRequests(self): - """get the number of requests that are still available for the user today""" + """get the number of requests that are still available for the user today. Information is only accessible after you make some query.""" return self._remainingAvailableRequests def getDailyAvailableRequests(self): - """get the total number of requests that the user can make in a day""" + """get the total number of requests that the user can make in a day. Information is only accessible after you make some query.""" return self._dailyAvailableRequests + def getUsageInfo(self): + """return the number of used and total available tokens. 
Can be used at any time (also before making queries)""" + return self.jsonRequest("/api/v1/usage", { "apiKey": self._apiKey }) + + def getUrl(self, query): """ return the url that can be used to get the content that matches the query @@ -349,7 +354,7 @@ def suggestConcepts(self, prefix, sources = ["concepts"], lang = "eng", conceptL params = { "prefix": prefix, "source": sources, "lang": lang, "conceptLang": conceptLang, "page": page, "count": count} params.update(returnInfo.getParams()) params.update(kwargs) - return self.jsonRequest("/json/suggestConcepts", params) + return self.jsonRequest("/json/suggestConceptsFast", params) def suggestCategories(self, prefix, page = 1, count = 20, returnInfo = ReturnInfo(), **kwargs): @@ -364,7 +369,7 @@ def suggestCategories(self, prefix, page = 1, count = 20, returnInfo = ReturnInf params = { "prefix": prefix, "page": page, "count": count } params.update(returnInfo.getParams()) params.update(kwargs) - return self.jsonRequest("/json/suggestCategories", params) + return self.jsonRequest("/json/suggestCategoriesFast", params) def suggestNewsSources(self, prefix, dataType = ["news", "pr", "blog"], page = 1, count = 20, **kwargs): @@ -378,7 +383,7 @@ def suggestNewsSources(self, prefix, dataType = ["news", "pr", "blog"], page = 1 assert page > 0, "page parameter should be above 0" params = {"prefix": prefix, "dataType": dataType, "page": page, "count": count} params.update(kwargs) - return self.jsonRequest("/json/suggestSources", params) + return self.jsonRequest("/json/suggestSourcesFast", params) def suggestSourceGroups(self, prefix, page = 1, count = 20, **kwargs): @@ -413,7 +418,7 @@ def suggestLocations(self, prefix, sources = ["place", "country"], lang = "eng", assert len(sortByDistanceTo) == 2, "The sortByDistanceTo should contain two float numbers" params["closeToLat"] = sortByDistanceTo[0] params["closeToLon"] = sortByDistanceTo[1] - return self.jsonRequest("/json/suggestLocations", params) + return 
self.jsonRequest("/json/suggestLocationsFast", params) def suggestLocationsAtCoordinate(self, latitude, longitude, radiusKm, limitToCities = False, lang = "eng", count = 20, ignoreNonWiki = True, returnInfo = ReturnInfo(), **kwargs): @@ -433,7 +438,7 @@ def suggestLocationsAtCoordinate(self, latitude, longitude, radiusKm, limitToCit params = { "action": "getLocationsAtCoordinate", "lat": latitude, "lon": longitude, "radius": radiusKm, "limitToCities": limitToCities, "count": count, "lang": lang } params.update(returnInfo.getParams()) params.update(kwargs) - return self.jsonRequest("/json/suggestLocations", params) + return self.jsonRequest("/json/suggestLocationsFast", params) def suggestSourcesAtCoordinate(self, latitude, longitude, radiusKm, count = 20, **kwargs): @@ -448,7 +453,7 @@ def suggestSourcesAtCoordinate(self, latitude, longitude, radiusKm, count = 20, assert isinstance(longitude, (int, float)), "The 'longitude' should be a number" params = {"action": "getSourcesAtCoordinate", "lat": latitude, "lon": longitude, "radius": radiusKm, "count": count} params.update(kwargs) - return self.jsonRequest("/json/suggestSources", params) + return self.jsonRequest("/json/suggestSourcesFast", params) def suggestSourcesAtPlace(self, conceptUri, dataType = "news", page = 1, count = 20, **kwargs): @@ -461,7 +466,21 @@ def suggestSourcesAtPlace(self, conceptUri, dataType = "news", page = 1, count = """ params = {"action": "getSourcesAtPlace", "conceptUri": conceptUri, "page": page, "count": count, "dataType": dataType} params.update(kwargs) - return self.jsonRequest("/json/suggestSources", params) + return self.jsonRequest("/json/suggestSourcesFast", params) + + + def suggestAuthors(self, prefix, page = 1, count = 20, **kwargs): + """ + return a list of news sources that match the prefix + @param prefix: input text that should be contained in the author name and source url + @param page: page of results + @param count: number of returned suggestions + """ + assert page > 
0, "page parameter should be above 0" + params = {"prefix": prefix, "page": page, "count": count} + params.update(kwargs) + return self.jsonRequest("/json/suggestAuthorsFast", params) + def suggestConceptClasses(self, prefix, lang = "eng", conceptLang = "eng", source = ["dbpedia", "custom"], page = 1, count = 20, returnInfo = ReturnInfo(), **kwargs): @@ -552,6 +571,13 @@ def getNewsSourceUri(self, sourceName, dataType = ["news", "pr", "blog"]): return None + def getSourceUri(self, sourceName, dataType=["news", "pr", "blog"]): + """ + alternative (shorter) name for the method getNewsSourceUri() + """ + return self.getNewsSourceUri(sourceName, dataType) + + def getSourceGroupUri(self, sourceGroupName): """ return the URI of the source group that best matches the name @@ -600,6 +626,18 @@ def getCustomConceptUri(self, label, lang = "eng"): return None + def getAuthorUri(self, authorName): + """ + return author uri that that is the best match for the given author name (and potentially source url) + if there are multiple matches for the given author name, they are sorted based on the number of articles they have written (from most to least frequent) + @param authorName: partial or full name of the author, potentially also containing the source url (e.g. "george brown nytimes") + """ + matches = self.suggestAuthors(authorName) + if matches != None and isinstance(matches, list) and len(matches) > 0 and "uri" in matches[0]: + return matches[0]["uri"] + return None + + @staticmethod def getUriFromUriWgt(uriWgtList): """ diff --git a/eventregistry/Query.py b/eventregistry/Query.py index bec0d4f..1be97b5 100644 --- a/eventregistry/Query.py +++ b/eventregistry/Query.py @@ -33,22 +33,24 @@ def __init__(self, dateEnd = None, dateMention = None, sourceLocationUri = None, - sourceGroupUri = None, + sourceGroupUri=None, + authorUri = None, keywordLoc = "body", minMaxArticlesInEvent = None, exclude = None): """ - @param keyword: keyword(s) to query. 
Either None, string or QueryItems - @param conceptUri: concept(s) to query. Either None, string or QueryItems - @param sourceUri: source(s) to query. Either None, string or QueryItems - @param locationUri: location(s) to query. Either None, string or QueryItems - @param categoryUri: categories to query. Either None, string or QueryItems - @param lang: language(s) to query. Either None, string or QueryItems + @param keyword: keyword(s) to query. Either None, string or QueryItems instance + @param conceptUri: concept(s) to query. Either None, string or QueryItems instance + @param sourceUri: source(s) to query. Either None, string or QueryItems instance + @param locationUri: location(s) to query. Either None, string or QueryItems instance + @param categoryUri: categories to query. Either None, string or QueryItems instance + @param lang: language(s) to query. Either None, string or QueryItems instance @param dateStart: starting date. Either None, string or date or datetime @param dateEnd: ending date. Either None, string or date or datetime @param dateMention: search by mentioned dates - Either None, string or date or datetime or a list of these types @param sourceLocationUri: find content generated by news sources at the specified geographic location - can be a city URI or a country URI. Multiple items can be provided using a list @param sourceGroupUri: a single or multiple source group URIs. A source group is a group of news sources, commonly defined based on common topic or importance + @param authorUri: author(s) to query. Either None, string or QueryItems instance @param keywordLoc: where should we look when searching using the keywords provided by "keyword" parameter. "body" (default), "title", or "body,title" @param minMaxArticlesInEvent: a tuple containing the minimum and maximum number of articles that should be in the resulting events. Parameter relevant only if querying events @param exclude: a instance of BaseQuery, CombinedQuery or None. 
Used to filter out results matching the other criteria specified in this query @@ -78,6 +80,8 @@ def __init__(self, self._setQueryArrVal("sourceLocationUri", sourceLocationUri) self._setQueryArrVal("sourceGroupUri", sourceGroupUri) + self._setQueryArrVal("authorUri", authorUri) + if keywordLoc != "body": self._queryObj["keywordLoc"] = keywordLoc diff --git a/eventregistry/QueryArticles.py b/eventregistry/QueryArticles.py index 0dc9fa4..f7abf16 100644 --- a/eventregistry/QueryArticles.py +++ b/eventregistry/QueryArticles.py @@ -12,6 +12,7 @@ def __init__(self, sourceUri = None, sourceLocationUri = None, sourceGroupUri = None, + authorUri = None, locationUri = None, lang = None, dateStart = None, @@ -26,13 +27,16 @@ def __init__(self, ignoreSourceUri = None, ignoreSourceLocationUri = None, ignoreSourceGroupUri = None, + ignoreAuthorUri = None, ignoreLocationUri = None, ignoreLang = None, - ignoreKeywordsLoc = "body", + isDuplicateFilter = "keepAll", hasDuplicateFilter = "keepAll", - eventFilter="keepAll", + eventFilter = "keepAll", + startSourceRankPercentile = 0, + endSourceRankPercentile = 100, dataType = "news", requestedResult = None): """ @@ -60,6 +64,9 @@ def __init__(self, @param sourceGroupUri: find articles that were written by news sources that are assigned to the specified source group. If multiple source groups are provided, then put them into a list inside QueryItems.OR() Source group uri for a given name can be obtained using EventRegistry.getSourceGroupUri(). + @param authorUri: find articles that were written by a specific author. + If multiple authors should be considered use QueryItems.OR() to provide the list of authors. + Author uri for a given author name can be obtained using EventRegistry.getAuthorUri(). @param locationUri: find articles that describe something that occured at a particular location. If value can be a string or a list of strings provided in QueryItems.OR(). Location uri can either be a city or a country. 
Location uri for a given name can be obtained using EventRegistry.getLocationUri(). @@ -77,6 +84,7 @@ def __init__(self, @param ignoreSourceUri: ignore articles that have been written by *any* of the specified news sources @param ignoreSourceLocationUri: ignore articles that have been written by sources located at *any* of the specified locations @param ignoreSourceGroupUri: ignore articles that have been written by sources in *any* of the specified source groups + @param ignoreAuthorUri: ignore articles that were written by *any* of the specified authors @param ignoreLocationUri: ignore articles that occured in any of the provided locations. A location can be a city or a place @param ignoreLang: ignore articles that are written in *any* of the provided languages @param ignoreKeywordsLoc: where should we look when data should be used when searching using the keywords provided by "ignoreKeywords" parameter. "body" (default), "title", or "body,title" @@ -93,6 +101,8 @@ def __init__(self, "skipArticlesWithoutEvent" (skip articles that are not describing any known event in ER) "keepOnlyArticlesWithoutEvent" (return only the articles that are not describing any known event in ER) "keepAll" (no filtering, default) + @param startSourceRankPercentile: starting percentile of the sources to consider in the results (default: 0). Value should be in range 0-90 and divisible by 10. + @param endSourceRankPercentile: ending percentile of the sources to consider in the results (default: 100). Value should be in range 10-100 and divisible by 10. @param dataType: what data types should we search? "news" (news content, default), "pr" (press releases), or "blog". If you want to use multiple data types, put them in an array (e.g. ["news", "pr"]) @param requestedResult: the information to return as the result of the query. 
By default return the list of matching articles @@ -106,6 +116,7 @@ def __init__(self, self._setQueryArrVal(sourceUri, "sourceUri", "sourceOper", "or") self._setQueryArrVal(sourceLocationUri, "sourceLocationUri", None, "or") self._setQueryArrVal(sourceGroupUri, "sourceGroupUri", "sourceGroupOper", "or") + self._setQueryArrVal(authorUri, "authorUri", "authorOper", "or") self._setQueryArrVal(locationUri, "locationUri", None, "or") # location such as "http://en.wikipedia.org/wiki/Ljubljana" self._setQueryArrVal(lang, "lang", None, "or") # a single lang or list (possible: eng, deu, spa, zho, slv) @@ -131,6 +142,7 @@ def __init__(self, self._setQueryArrVal(ignoreSourceUri, "ignoreSourceUri", None, "or") self._setQueryArrVal(ignoreSourceLocationUri, "ignoreSourceLocationUri", None, "or") self._setQueryArrVal(ignoreSourceGroupUri, "ignoreSourceGroupUri", None, "or") + self._setQueryArrVal(ignoreAuthorUri, "ignoreAuthorUri", None, "or") self._setQueryArrVal(ignoreLocationUri, "ignoreLocationUri", None, "or") self._setQueryArrVal(ignoreLang, "ignoreLang", None, "or") @@ -141,6 +153,13 @@ def __init__(self, self._setValIfNotDefault("isDuplicateFilter", isDuplicateFilter, "keepAll") self._setValIfNotDefault("hasDuplicateFilter", hasDuplicateFilter, "keepAll") self._setValIfNotDefault("eventFilter", eventFilter, "keepAll") + assert startSourceRankPercentile >= 0 and startSourceRankPercentile % 10 == 0 and startSourceRankPercentile <= 100 + assert endSourceRankPercentile >= 0 and endSourceRankPercentile % 10 == 0 and endSourceRankPercentile <= 100 + assert startSourceRankPercentile < endSourceRankPercentile + if startSourceRankPercentile != 0: + self._setVal("startSourceRankPercentile", startSourceRankPercentile) + if endSourceRankPercentile != 100: + self._setVal("endSourceRankPercentile", endSourceRankPercentile) # always set the data type self._setVal("dataType", dataType) @@ -407,12 +426,20 @@ def __init__(self): class RequestArticlesConceptAggr(RequestArticles): def 
__init__(self, - conceptCount = 25, + conceptCount=25, + conceptCountPerType = None, + conceptScoring = "importance", articlesSampleSize = 10000, returnInfo = ReturnInfo()): """ get aggreate of concepts of resulting articles @param conceptCount: number of top concepts to return (at most 500) + @param conceptCountPerType: if you wish to limit the number of top concepts per type (person, org, loc, wiki) then set this to some number. + If you want to get equal number of concepts for each type then set conceptCountPerType to conceptCount/4 (since there are 4 concept types) + @param conceptScoring: how should the top concepts be computed. Possible values are + "importance" (takes into account how frequently a concept is mentioned and how relevant it is in an article), + "frequency" (ranks the concepts simply by how frequently the concept is mentioned in the results) and + "uniqueness" (computes what are the top concepts that are frequently mentioned in the results of your search query but less frequently mentioned in the news in general) @param articlesSampleSize: on what sample of results should the aggregate be computed (at most 20000) @param returnInfo: what details about the concepts should be included in the returned information """ @@ -421,6 +448,9 @@ def __init__(self, self.resultType = "conceptAggr" self.conceptAggrConceptCount = conceptCount self.conceptAggrSampleSize = articlesSampleSize + self.conceptAggrScoring = conceptScoring + if conceptCountPerType != None: + self.conceptAggrConceptCountPerType = conceptCountPerType self.__dict__.update(returnInfo.getParams("conceptAggr")) @@ -444,30 +474,35 @@ def __init__(self, class RequestArticlesSourceAggr(RequestArticles): def __init__(self, articlesSampleSize = 20000, + sourceCount = 50, + normalizeBySourceArts = False, returnInfo = ReturnInfo()): """ get aggreate of news sources of resulting articles @param articlesSampleSize: on what sample of results should the aggregate be computed (at most 1000000) + @param 
sourceCount: the number of top sources to return + @param normalizeBySourceArts: some sources generate significantly more content than others which is why + they can appear as top souce for a given query. If you want to normalize and sort the sources by the total number of + articles that they have published set this to True. This will return as top sources those that potentially publish less + content overall, but their published content is more about the searched query. @param returnInfo: what details about the sources should be included in the returned information """ assert articlesSampleSize <= 1000000 self.resultType = "sourceAggr" + self.sourceAggrSourceCount = sourceCount self.sourceAggrSampleSize = articlesSampleSize self.__dict__.update(returnInfo.getParams("sourceAggr")) class RequestArticlesKeywordAggr(RequestArticles): def __init__(self, - lang = "eng", articlesSampleSize = 2000): """ get top keywords in the resulting articles - @param lang: articles in which language should be analyzed and processed @param articlesSampleSize: on what sample of results should the aggregate be computed (at most 20000) """ assert articlesSampleSize <= 20000 self.resultType = "keywordAggr" - self.keywordAggrLang = lang self.keywordAggrSampleSize = articlesSampleSize @@ -477,6 +512,7 @@ def __init__(self, conceptCount = 25, linkCount = 50, articlesSampleSize = 10000, + skipQueryConcepts = True, returnInfo = ReturnInfo()): """ get concept graph of resulting articles. 
Identify concepts that frequently co-occur with other concepts @@ -492,6 +528,7 @@ def __init__(self, self.conceptGraphConceptCount = conceptCount self.conceptGraphLinkCount = linkCount self.conceptGraphSampleSize = articlesSampleSize + self.conceptGraphSkipQueryConcepts = skipQueryConcepts self.__dict__.update(returnInfo.getParams("conceptGraph")) @@ -522,22 +559,22 @@ def __init__(self, class RequestArticlesConceptTrends(RequestArticles): def __init__(self, conceptUris = None, - count = 25, + conceptCount = 25, articlesSampleSize=10000, returnInfo = ReturnInfo()): """ get trending of concepts in the resulting articles @param conceptUris: list of concept URIs for which to return trending information. If None, then top concepts will be automatically computed - @param count: if the concepts are not provided, what should be the number of automatically determined concepts to return (at most 50) + @param conceptCount: if the concepts are not provided, what should be the number of automatically determined concepts to return (at most 50) @param articlesSampleSize: on what sample of results should the aggregate be computed (at most 50000) @param returnInfo: what details should be included in the returned information """ - assert count <= 50 + assert conceptCount <= 50 assert articlesSampleSize <= 50000 self.resultType = "conceptTrends" if conceptUris != None: self.conceptTrendsConceptUri = conceptUris - self.conceptTrendsConceptCount = count + self.conceptTrendsConceptCount = conceptCount self.conceptTrendsSampleSize = articlesSampleSize self.__dict__.update(returnInfo.getParams("conceptTrends")) @@ -557,27 +594,38 @@ def __init__(self, maxArticleCount = 100, updatesAfterTm = None, updatesAfterMinsAgo = None, + updatesUntilTm = None, + updatesUntilMinsAgo = None, lang = None, mandatorySourceLocation = False, returnInfo = ReturnInfo()): """ get the list of articles that were recently added to the Event Registry and match the selected criteria - @param maxArticleCount: max 
articles to return (at most 500) + @param maxArticleCount: the maximum number of articles to return in the call (the number can be even higher than 100 but in case more articles + are returned, the call will also use more tokens) @param updatesAfterTm: the time after which the articles were added (returned by previous call to the same method) @param updatesAfterMinsAgo: how many minutes into the past should we check (set either this or updatesAfterTm property, but not both) + @param updatesUntilTm: what is the latest time when the articles were added (in case you don't want the most recent articles) + @param updatesUntilMinsAgo: how many minutes ago was the latest time when the articles were added @param lang: return only articles in the specified languages (None if no limits). accepts string or a list of strings @param mandatorySourceLocation: return only articles for which we know the source's geographic location @param returnInfo: what details should be included in the returned information """ assert maxArticleCount <= 1000 assert updatesAfterTm == None or updatesAfterMinsAgo == None, "You should specify either updatesAfterTm or updatesAfterMinsAgo parameter, but not both" + assert updatesUntilTm == None or updatesUntilMinsAgo == None, "You should specify either updatesUntilTm or updatesUntilMinsAgo parameter, but not both" self.resultType = "recentActivityArticles" self.recentActivityArticlesMaxArticleCount = maxArticleCount if updatesAfterTm != None: self.recentActivityArticlesUpdatesAfterTm = QueryParamsBase.encodeDateTime(updatesAfterTm) if updatesAfterMinsAgo != None: self.recentActivityArticlesUpdatesAfterMinsAgo = updatesAfterMinsAgo + if updatesUntilTm != None: + self.recentActivityArticlesUpdatesUntilTm = QueryParamsBase.encodeDateTime(updatesUntilTm) + if updatesUntilMinsAgo != None: + self.recentActivityArticlesUpdatesUntilMinsAgo = updatesUntilMinsAgo if lang != None: self.recentActivityArticlesLang = lang + self.recentActivityArticlesMaxArticleCount 
= maxArticleCount self.recentActivityArticlesMandatorySourceLocation = mandatorySourceLocation self.__dict__.update(returnInfo.getParams("recentActivityArticles")) \ No newline at end of file diff --git a/eventregistry/QueryEvent.py b/eventregistry/QueryEvent.py index 6be90a9..2b2aa11 100644 --- a/eventregistry/QueryEvent.py +++ b/eventregistry/QueryEvent.py @@ -2,6 +2,7 @@ from eventregistry.Base import * from eventregistry.ReturnInfo import * from eventregistry.QueryArticles import QueryArticles, RequestArticlesInfo +from eventregistry.Query import * class QueryEvent(Query): @@ -12,7 +13,7 @@ def __init__(self, eventUriOrList, requestedResult = None): """ - @param eventUriOrUriList: a single event uri or a list of event uris + @param eventUriOrUriList: a single event uri or a list of event uris (max 50) @param requestedResult: the information to return as the result of the query. By default return the details of the event """ super(QueryEvent, self).__init__() @@ -50,19 +51,107 @@ class QueryEventArticlesIter(QueryEvent, six.Iterator): """ Class for obtaining an iterator over all articles in the event """ - def __init__(self, eventUri): - """@param eventUri: a single event for which we want to obtain the list of articles in it""" + def __init__(self, eventUri, + lang = None, + keywords = None, + conceptUri = None, + categoryUri = None, + sourceUri = None, + sourceLocationUri = None, + sourceGroupUri = None, + authorUri = None, + locationUri = None, + dateStart = None, + dateEnd = None, + dateMentionStart = None, + dateMentionEnd=None, + keywordsLoc="body", + + startSourceRankPercentile = 0, + endSourceRankPercentile = 100): + """ + @param eventUri: a single event for which we want to obtain the list of articles in it + @param lang: find articles that are written in the specified language. + If more than one language is specified, resulting articles has to be written in *any* of the languages. 
+ @param keywords: limit the event articles to those that mention the specified keywords. + A single keyword/phrase can be provided as a string, multiple keywords/phrases can be provided as a list of strings. + Use QueryItems.AND() if *all* provided keywords/phrases should be mentioned, or QueryItems.OR() if *any* of the keywords/phrases should be mentioned. + For example, use QueryItems.OR() to specify a list of keywords where any of the keywords has to appear. + @param conceptUri: limit the event articles to those where the concept with concept uri is mentioned. + A single concept uri can be provided as a string, multiple concept uris can be provided as a list of strings. + Use QueryItems.AND() if *all* provided concepts should be mentioned, or QueryItems.OR() if *any* of the concepts should be mentioned. + To obtain a concept uri using a concept label use EventRegistry.getConceptUri(). + @param categoryUri: limit the event articles to those that are assigned to a particular category. + A single category can be provided as a string, while multiple categories can be provided as a list in QueryItems.AND() or QueryItems.OR(). + A category uri can be obtained from a category name using EventRegistry.getCategoryUri(). + @param sourceUri: limit the event articles to those that were written by a news source sourceUri. + If multiple sources should be considered use QueryItems.OR() to provide the list of sources. + Source uri for a given news source name can be obtained using EventRegistry.getNewsSourceUri(). + @param sourceLocationUri: limit the event articles to those that were written by news sources located in the given geographic location. + If multiple source locations are provided, then put them into a list inside QueryItems.OR(). + Location uri can either be a city or a country. Location uri for a given name can be obtained using EventRegistry.getLocationUri().
+ @param sourceGroupUri: limit the event articles to those that were written by news sources that are assigned to the specified source group. + If multiple source groups are provided, then put them into a list inside QueryItems.OR(). + Source group uri for a given name can be obtained using EventRegistry.getSourceGroupUri(). + @param authorUri: find articles that were written by a specific author. + If multiple authors should be considered use QueryItems.OR() to provide the list of authors. + Author uri for a given author name can be obtained using EventRegistry.getAuthorUri(). + @param locationUri: find articles that describe something that occurred at a particular location. + The value can be a string or a list of strings provided in QueryItems.OR(). + Location uri can either be a city or a country. Location uri for a given name can be obtained using EventRegistry.getLocationUri(). + @param dateStart: find articles that were written on or after dateStart. Date should be provided in YYYY-MM-DD format, datetime.time or datetime.datetime. + @param dateEnd: find articles that were written before or on dateEnd. Date should be provided in YYYY-MM-DD format, datetime.time or datetime.datetime. + + @param dateMentionStart: limit the event articles to those that explicitly mention a date that is equal to or greater than dateMentionStart. + @param dateMentionEnd: limit the event articles to those that explicitly mention a date that is lower than or equal to dateMentionEnd. + @param keywordsLoc: where should we look when searching using the keywords provided by "keywords" parameter. "body" (default), "title", or "body,title" + + @param startSourceRankPercentile: starting percentile of the sources to consider in the results (default: 0). Value should be in range 0-90 and divisible by 10. + @param endSourceRankPercentile: ending percentile of the sources to consider in the results (default: 100). Value should be in range 10-100 and divisible by 10.
+ """ super(QueryEventArticlesIter, self).__init__(eventUri) - - - def count(self, eventRegistry, - lang = None): + self._setQueryArrVal(keywords, "keyword", "keywordOper", "and") + self._setQueryArrVal(conceptUri, "conceptUri", "conceptOper", "and") + self._setQueryArrVal(categoryUri, "categoryUri", "categoryOper", "or") + self._setQueryArrVal(sourceUri, "sourceUri", "sourceOper", "or") + self._setQueryArrVal(sourceLocationUri, "sourceLocationUri", None, "or") + self._setQueryArrVal(sourceGroupUri, "sourceGroupUri", "sourceGroupOper", "or") + self._setQueryArrVal(authorUri, "authorUri", "authorOper", "or") + self._setQueryArrVal(locationUri, "locationUri", None, "or") # location such as "http://en.wikipedia.org/wiki/Ljubljana" + + self._setQueryArrVal(lang, "lang", None, "or") # a single lang or list (possible: eng, deu, spa, zho, slv) + + # starting date of the published articles (e.g. 2014-05-02) + if dateStart != None: + self._setDateVal("dateStart", dateStart) + # ending date of the published articles (e.g. 2014-05-02) + if dateEnd != None: + self._setDateVal("dateEnd", dateEnd) + + # first valid mentioned date detected in articles (e.g. 2014-05-02) + if dateMentionStart != None: + self._setDateVal("dateMentionStart", dateMentionStart) + # last valid mentioned date detected in articles (e.g. 
2014-05-02) + if dateMentionEnd != None: + self._setDateVal("dateMentionEnd", dateMentionEnd) + + self._setValIfNotDefault("keywordLoc", keywordsLoc, "body") + + assert startSourceRankPercentile >= 0 and startSourceRankPercentile % 10 == 0 and startSourceRankPercentile <= 100 + assert endSourceRankPercentile >= 0 and endSourceRankPercentile % 10 == 0 and endSourceRankPercentile <= 100 + assert startSourceRankPercentile < endSourceRankPercentile + if startSourceRankPercentile != 0: + self._setVal("startSourceRankPercentile", startSourceRankPercentile) + if endSourceRankPercentile != 100: + self._setVal("endSourceRankPercentile", endSourceRankPercentile) + + + def count(self, eventRegistry): """ return the number of articles that match the criteria @param eventRegistry: instance of EventRegistry class. used to obtain the necessary data - @param lang: array or a single language in which to return the list of matching articles. If None, then return articles in all languages """ - self.setRequestedResult(RequestEventArticles(lang = lang)) + self.setRequestedResult(RequestEventArticles(**self.queryParams)) res = eventRegistry.execQuery(self) if "error" in res: print(res["error"]) @@ -71,29 +160,28 @@ def count(self, eventRegistry, def execQuery(self, eventRegistry, - lang = None, sortBy = "cosSim", sortByAsc = False, returnInfo = ReturnInfo(articleInfo = ArticleInfoFlags(bodyLen = -1)), maxItems = -1): """ @param eventRegistry: instance of EventRegistry class. used to obtain the necessary data - @param lang: array or a single language in which to return the list of matching articles. If None, then return articles in all languages + @param sortBy: order in which event articles are sorted. 
Options: none (no specific sorting), id (internal id), date (published date), cosSim (closeness to event centroid), sourceImportance (manually curated score of source importance - high value, high importance), sourceImportanceRank (reverse of sourceImportance), sourceAlexaGlobalRank (global rank of the news source), sourceAlexaCountryRank (country rank of the news source), socialScore (total shares on social media), facebookShares (shares on Facebook only) @param sortByAsc: should the results be sorted in ascending order (True) or descending (False) @param returnInfo: what details should be included in the returned information @param maxItems: maximum number of items to be returned. Used to stop iteration sooner than results run out """ self._er = eventRegistry - self._lang = lang - self._sortBy = sortBy - self._sortByAsc = sortByAsc - self._returnInfo = returnInfo - self._articleBatchSize = 100 self._articlePage = 0 self._totalPages = None # if we want to return only a subset of items: self._maxItems = maxItems self._currItem = 0 + + self._articlesSortBy = sortBy + self._articlesSortByAsc = sortByAsc + self._returnInfo = returnInfo + # download the list of article uris self._articleList = [] return self @@ -109,9 +197,12 @@ def _getNextArticleBatch(self): return if self._er._verboseOutput: print("Downloading article page %d from event %s" % (self._articlePage, eventUri)) - self.setRequestedResult(RequestEventArticles(page=self._articlePage, count=self._articleBatchSize, - lang = self._lang, sortBy= self._sortBy, sortByAsc=self._sortByAsc, - returnInfo = self._returnInfo)) + + self.setRequestedResult(RequestEventArticles( + page = self._articlePage, + sortBy = self._articlesSortBy, sortByAsc = self._articlesSortByAsc, + returnInfo = self._returnInfo, + **self.queryParams)) res = self._er.execQuery(self) if "error" in res: print(res["error"]) @@ -160,61 +251,164 @@ def __init__(self, returnInfo = ReturnInfo()): -class RequestEventArticles(RequestEvent): +class 
RequestEventArticles(RequestEvent, QueryParamsBase): def __init__(self, - page = 1, - count = 100, - lang = None, - sortBy = "cosSim", sortByAsc = False, - returnInfo = ReturnInfo(articleInfo = ArticleInfoFlags(bodyLen = -1))): + page = 1, + count = 100, + + lang = None, + keywords = None, + conceptUri = None, + categoryUri = None, + sourceUri = None, + sourceLocationUri = None, + sourceGroupUri = None, + authorUri = None, + locationUri = None, + dateStart = None, + dateEnd = None, + dateMentionStart = None, + dateMentionEnd=None, + keywordsLoc="body", + + startSourceRankPercentile = 0, + endSourceRankPercentile = 100, + + sortBy = "cosSim", sortByAsc = False, + returnInfo=ReturnInfo(articleInfo=ArticleInfoFlags(bodyLen=-1)), + **kwds): """ return articles about the event @param page: page of the articles to return (1, 2, ...) @param count: number of articles to return per page (at most 100) - @param lang: a single lanugage or a list of languages in which to return the articles. None to return articles in all languages + + @param keywords: limit the event articles to those that mention the specified keywords. + A single keyword/phrase can be provided as a string, multiple keywords/phrases can be provided as a list of strings. + Use QueryItems.AND() if *all* provided keywords/phrases should be mentioned, or QueryItems.OR() if *any* of the keywords/phrases should be mentioned. + or QueryItems.OR() to specify a list of keywords where any of the keywords have to appear + @param conceptUri: limit the event articles to those where the concept with concept uri is mentioned. + A single concept uri can be provided as a string, multiple concept uris can be provided as a list of strings. + Use QueryItems.AND() if *all* provided concepts should be mentioned, or QueryItems.OR() if *any* of the concepts should be mentioned. + To obtain a concept uri using a concept label use EventRegistry.getConceptUri(). 
+ @param categoryUri: limit the event articles to those that are assigned to a particular category. + A single category can be provided as a string, while multiple categories can be provided as a list in QueryItems.AND() or QueryItems.OR(). + A category uri can be obtained from a category name using EventRegistry.getCategoryUri(). + @param sourceUri: limit the event articles to those that were written by a news source sourceUri. + If multiple sources should be considered use QueryItems.OR() to provide the list of sources. + Source uri for a given news source name can be obtained using EventRegistry.getNewsSourceUri(). + @param sourceLocationUri: limit the event articles to those that were written by news sources located in the given geographic location. + If multiple source locations are provided, then put them into a list inside QueryItems.OR(). + Location uri can either be a city or a country. Location uri for a given name can be obtained using EventRegistry.getLocationUri(). + @param sourceGroupUri: limit the event articles to those that were written by news sources that are assigned to the specified source group. + If multiple source groups are provided, then put them into a list inside QueryItems.OR(). + Source group uri for a given name can be obtained using EventRegistry.getSourceGroupUri(). + @param authorUri: find articles that were written by a specific author. + If multiple authors should be considered use QueryItems.OR() to provide the list of authors. + Author uri for a given author name can be obtained using EventRegistry.getAuthorUri(). + @param locationUri: find articles that describe something that occurred at a particular location. + The value can be a string or a list of strings provided in QueryItems.OR(). + Location uri can either be a city or a country. Location uri for a given name can be obtained using EventRegistry.getLocationUri(). + @param lang: find articles that are written in the specified language.
+ If more than one language is specified, resulting articles have to be written in *any* of the languages. + @param dateStart: find articles that were written on or after dateStart. Date should be provided in YYYY-MM-DD format, datetime.time or datetime.datetime. + @param dateEnd: find articles that were written before or on dateEnd. Date should be provided in YYYY-MM-DD format, datetime.time or datetime.datetime. + + @param dateMentionStart: limit the event articles to those that explicitly mention a date that is equal to or greater than dateMentionStart. + @param dateMentionEnd: limit the event articles to those that explicitly mention a date that is lower than or equal to dateMentionEnd. + @param keywordsLoc: where should we look when searching using the keywords provided by "keywords" parameter. "body" (default), "title", or "body,title" + + @param startSourceRankPercentile: starting percentile of the sources to consider in the results (default: 0). Value should be in range 0-100 and divisible by 10. + @param endSourceRankPercentile: ending percentile of the sources to consider in the results (default: 100). Value should be in range 0-100 and divisible by 10. + @param sortBy: order in which event articles are sorted.
Options: id (internal id), date (published date), cosSim (closeness to event centroid), sourceImportanceRank (importance of the news source, custom set), sourceAlexaGlobalRank (global rank of the news source), sourceAlexaCountryRank (country rank of the news source), socialScore (total shares in social media) @param sortByAsc: should the articles be sorted in ascending order (True) or descending (False) based on sortBy value @param returnInfo: what details should be included in the returned information """ + RequestEvent.__init__(self) + QueryParamsBase.__init__(self) assert page >= 1, "page has to be >= 1" assert count <= 100, "at most 100 articles can be returned per call" self.resultType = "articles" self.articlesPage = page self.articlesCount = count - if lang != None: - self.articlesLang = lang + + self._setQueryArrVal(keywords, "keyword", "keywordOper", "and") + self._setQueryArrVal(conceptUri, "conceptUri", "conceptOper", "and") + self._setQueryArrVal(categoryUri, "categoryUri", "categoryOper", "or") + self._setQueryArrVal(sourceUri, "sourceUri", "sourceOper", "or") + self._setQueryArrVal(sourceLocationUri, "sourceLocationUri", None, "or") + self._setQueryArrVal(sourceGroupUri, "sourceGroupUri", "sourceGroupOper", "or") + self._setQueryArrVal(authorUri, "authorUri", "authorOper", "or") + self._setQueryArrVal(locationUri, "locationUri", None, "or") # location such as "http://en.wikipedia.org/wiki/Ljubljana" + + self._setQueryArrVal(lang, "lang", None, "or") # a single lang or list (possible: eng, deu, spa, zho, slv) + + # starting date of the published articles (e.g. 2014-05-02) + if dateStart != None: + self._setDateVal("dateStart", dateStart) + # ending date of the published articles (e.g. 2014-05-02) + if dateEnd != None: + self._setDateVal("dateEnd", dateEnd) + + # first valid mentioned date detected in articles (e.g. 
2014-05-02) + if dateMentionStart != None: + self._setDateVal("dateMentionStart", dateMentionStart) + # last valid mentioned date detected in articles (e.g. 2014-05-02) + if dateMentionEnd != None: + self._setDateVal("dateMentionEnd", dateMentionEnd) + + self._setValIfNotDefault("keywordLoc", keywordsLoc, "body") + + assert startSourceRankPercentile >= 0 and startSourceRankPercentile % 10 == 0 and startSourceRankPercentile <= 100 + assert endSourceRankPercentile >= 0 and endSourceRankPercentile % 10 == 0 and endSourceRankPercentile <= 100 + assert startSourceRankPercentile < endSourceRankPercentile + if startSourceRankPercentile != 0: + self._setVal("startSourceRankPercentile", startSourceRankPercentile) + if endSourceRankPercentile != 100: + self._setVal("endSourceRankPercentile", endSourceRankPercentile) + self.articlesSortBy = sortBy self.articlesSortByAsc = sortByAsc + # the filtering params are stored in queryParams. update the params on the self and delete the queryParams object + self.__dict__.update(self.queryParams) self.__dict__.update(returnInfo.getParams("articles")) + del self.queryParams class RequestEventArticleUriWgts(RequestEvent): def __init__(self, lang = None, - sortBy = "cosSim", sortByAsc = False): # order in which event articles are sorted. Options: id (internal id), date (published date), cosSim (closeness to event centroid), socialScore (total shares in social media) + sortBy="cosSim", sortByAsc=False, + **kwds): """ return just a list of article uris and their associated weights @param lang: a single language or a list of languages in which to return the articles. Set None to return all articles @param sortBy: order in which event articles are sorted. 
Options: id (internal id), date (published date), cosSim (closeness to event centroid), sourceImportanceRank (importance of the news source, custom set), sourceAlexaGlobalRank (global rank of the news source), sourceAlexaCountryRank (country rank of the news source), socialScore (total shares in social media) @param sortByAsc: should the articles be sorted in ascending order (True) or descending (False) based on sortBy value + @param kwds: any other potential query parameters - can be any of the parameters used in RequestEventArticles() constructor """ if lang != None: - self.uriWgtListLang = lang + self.articlesLang = lang self.uriWgtListSortBy = sortBy self.uriWgtListSortByAsc = sortByAsc self.resultType = "uriWgtList" + self.__dict__.update(**kwds) class RequestEventKeywordAggr(RequestEvent): - def __init__(self, lang = "eng"): # the lang parameter should match one of the languages for which we have articles in the event. + def __init__(self, lang=None, + **kwds): """ return keyword aggregate (tag-cloud) from articles in the event - @param lang: language for which to compute the keywords + @param lang: if not `None` then the top keywords will only be computed from the articles in the specified language. + The value should match one of the languages for which we have articles in the event. 
+ @param kwds: any other potential query parameters - can be any of the parameters used in RequestEventArticles() constructor """ self.resultType = "keywordAggr" - self.keywordAggrLang = lang + self.articlesLang = lang + self.__dict__.update(**kwds) @@ -238,13 +432,13 @@ def __init__(self): class RequestEventArticleTrend(RequestEvent): def __init__(self, - lang = mainLangs, + lang = None, page = 1, count = 100, minArticleCosSim = -1, returnInfo = ReturnInfo(articleInfo = ArticleInfoFlags(bodyLen = 0))): """ return trending information for the articles about the event - @param lang: languages for which to compute the trends + @param lang: languages for which to compute the trends. If None, then compute trends for all articles @param page: page of the articles for which to return information (1, 2, ...) @param count: number of articles returned per page (at most 100) @param minArticleCosSim: ignore articles that have cos similarity to centroid lower than the specified value (-1 for no limit) @@ -253,7 +447,7 @@ def __init__(self, assert page >= 1, "page has to be >= 1" assert count <= 100, "at most 100 articles can be returned per call" self.resultType = "articleTrend" - self.articleTrendLang = lang + self.articlesLang = lang self.articleTrendPage = page self.articleTrendCount = count self.articleTrendMinArticleCosSim = minArticleCosSim diff --git a/eventregistry/QueryEvents.py b/eventregistry/QueryEvents.py index 1ae6058..aaeae21 100644 --- a/eventregistry/QueryEvents.py +++ b/eventregistry/QueryEvents.py @@ -12,6 +12,7 @@ def __init__(self, sourceUri = None, sourceLocationUri = None, sourceGroupUri = None, + authorUri = None, locationUri = None, lang = None, dateStart = None, @@ -26,6 +27,7 @@ def __init__(self, ignoreSourceUri = None, ignoreSourceLocationUri = None, ignoreSourceGroupUri = None, + ignoreAuthorUri = None, ignoreLocationUri = None, ignoreLang = None, keywordsLoc = "body", @@ -56,6 +58,9 @@ def __init__(self, @param sourceGroupUri: find events that 
contain one or more articles that were written by news sources that are assigned to the specified source group. If multiple source groups are provided, then put them into a list inside QueryItems.OR() Source group uri for a given name can be obtained using EventRegistry.getSourceGroupUri(). + @param authorUri: find events that contain one or more articles that have been written by a specific author. + If multiple authors should be considered use QueryItems.OR() or QueryItems.AND() to provide the list of authors. + Author uri for a given author name can be obtained using EventRegistry.getAuthorUri(). @param locationUri: find events that occured at a particular location. If value can be a string or a list of strings provided in QueryItems.OR(). Location uri can either be a city or a country. Location uri for a given name can be obtained using EventRegistry.getLocationUri(). @@ -73,6 +78,7 @@ def __init__(self, @param ignoreSourceUri: ignore events that have have articles which have been written by any of the specified news sources @param ignoreSourceLocationUri: ignore events that have articles which been written by sources located at *any* of the specified locations @param ignoreSourceGroupUri: ignore events that have articles which have been written by sources in *any* of the specified source groups + @param ignoreAuthorUri: ignore articles that were written by *any* of the specified authors @param ignoreLocationUri: ignore events that occured in any of the provided locations. A location can be a city or a place @param ignoreLang: ignore events that are reported in any of the provided languages @param keywordsLoc: what data should be used when searching using the keywords provided by "keywords" parameter. 
"body" (default), "title", or "body,title" @@ -89,6 +95,7 @@ def __init__(self, self._setQueryArrVal(sourceUri, "sourceUri", "sourceOper", "or") self._setQueryArrVal(sourceLocationUri, "sourceLocationUri", None, "or") self._setQueryArrVal(sourceGroupUri, "sourceGroupUri", "sourceGroupOper", "or") + self._setQueryArrVal(authorUri, "authorUri", "authorOper", "or") self._setQueryArrVal(locationUri, "locationUri", None, "or") # location such as "http://en.wikipedia.org/wiki/Ljubljana" self._setQueryArrVal(lang, "lang", None, "or") # a single lang or list (possible: eng, deu, spa, zho, slv) @@ -113,6 +120,7 @@ def __init__(self, self._setQueryArrVal(ignoreSourceUri, "ignoreSourceUri", None, "or") self._setQueryArrVal(ignoreSourceLocationUri, "ignoreSourceLocationUri", None, "or") self._setQueryArrVal(ignoreSourceGroupUri, "ignoreSourceGroupUri", None, "or") + self._setQueryArrVal(ignoreAuthorUri, "ignoreAuthorUri", None, "or") self._setQueryArrVal(ignoreLocationUri, "ignoreLocationUri", None, "or") self._setQueryArrVal(ignoreLang, "ignoreLang", None, "or") @@ -210,6 +218,7 @@ def execQuery(self, eventRegistry, @param sortBy: how should the resulting events be sorted. Options: date (by event date), rel (relevance to the query), size (number of articles), socialScore (amount of shares in social media), none (no specific sorting) @param sortByAsc: should the results be sorted in ascending order (True) or descending (False) + @param returnInfo: what details should be included in the returned information @param maxItems: maximum number of items to be returned. 
Used to stop iteration sooner than results run out """ self._er = eventRegistry @@ -365,13 +374,14 @@ def __init__(self): class RequestEventsKeywordAggr(RequestEvents): - def __init__(self, lang = "eng"): + def __init__(self, lang = None): """ return keyword aggregate (tag cloud) on words in articles in resulting events - @param lang: in which language to produce the list of top keywords + @param lang: in which language to produce the list of top keywords. If None, then compute on all articles """ self.resultType = "keywordAggr" - self.keywordAggrLang = lang + if lang != None: + self.keywordAggrLang = lang @@ -430,15 +440,15 @@ def __init__(self, class RequestEventsConceptGraph(RequestEvents): def __init__(self, - conceptCount = 25, - linkCount = 50, - eventsSampleSize = 100000, + conceptCount = 50, + linkCount = 150, + eventsSampleSize = 50000, returnInfo = ReturnInfo()): """ compute which concept pairs frequently co-occur together in the resulting events - @param conceptCount: number of top concepts to return (at most 1000) - @param linkCount: number of links between the concepts to return (at most 2000) - @param eventsSampleSize: on what sample of results should the aggregate be computed (at most 300000) + @param conceptCount: number of top concepts to return (at most 1,000) + @param linkCount: number of links between the concepts to return (at most 2,000) + @param eventsSampleSize: on what sample of results should the aggregate be computed (at most 100000) @param returnInfo: what details about the concepts should be included in the returned information """ assert conceptCount <= 1000 @@ -500,7 +510,7 @@ def __init__(self, class RequestEventsSourceAggr(RequestEvents): def __init__(self, sourceCount = 30, - eventsSampleSize = 100000, + eventsSampleSize = 50000, returnInfo = ReturnInfo()): """ return top news sources that report about the events that match the search conditions @@ -509,7 +519,7 @@ def __init__(self, @param returnInfo: what details about the 
sources should be included in the returned information """ assert sourceCount <= 200 - assert eventsSampleSize <= 300000 + assert eventsSampleSize <= 100000 self.resultType = "sourceAggr" self.sourceAggrSourceCount = sourceCount self.sourceAggrSampleSize = eventsSampleSize @@ -543,7 +553,7 @@ def __init__(self, returnInfo = ReturnInfo()): """ return hierarchical clustering of events into smaller clusters. 2-means clustering is applied on each node in the tree - @param keywordCount: number of keywords to report in each of the clusters (at most !00) + @param keywordCount: number of keywords to report in each of the clusters (at most 100) @param maxEventsToCluster: try to cluster at most this number of events (at most 10000) @param returnInfo: what details about the concepts should be included in the returned information """ diff --git a/eventregistry/ReturnInfo.py b/eventregistry/ReturnInfo.py index b8d006b..a744cbc 100644 --- a/eventregistry/ReturnInfo.py +++ b/eventregistry/ReturnInfo.py @@ -64,6 +64,7 @@ class ArticleInfoFlags(ReturnInfoFlagsBase): @param body: article body @param url: article url @param eventUri: uri of the event to which the article belongs + @param authors: the list of authors of the news article @param concepts: the list of concepts mentioned in the article @param categories: the list of categories assigned to the article @param links: the list of urls of links identified in the article body @@ -85,6 +86,7 @@ def __init__(self, body = True, url = True, eventUri = True, + authors = True, concepts = False, categories = False, links = False, @@ -104,6 +106,7 @@ def __init__(self, self._setFlag("includeArticleBody", body, True) self._setFlag("includeArticleUrl", url, True) self._setFlag("includeArticleEventUri", eventUri, True) + self._setFlag("includeArticleAuthors", authors, True) self._setFlag("includeArticleConcepts", concepts, False) self._setFlag("includeArticleCategories", categories, False) self._setFlag("includeArticleLinks", links, 
False)
diff --git a/eventregistry/tests/TestQueryArticles.py b/eventregistry/tests/TestQueryArticles.py
index 0cd3a13..0c3b5f6 100644
--- a/eventregistry/tests/TestQueryArticles.py
+++ b/eventregistry/tests/TestQueryArticles.py
@@ -180,6 +180,28 @@ def testArticleListWithSourceGroupAndLocationSearch(self):
             self.assertTrue(loc.get("country").get("wikiUri") == usUri)
 
 
+    def testArticleListWithAuthorSearch(self):
+        """
+        make sure that search for author returns articles by that author
+        """
+        authorUri = self.er.getAuthorUri("associated")
+        q = QueryArticles(authorUri = authorUri)
+        q.setRequestedResult(RequestArticlesInfo(count = 100, returnInfo = self.returnInfo))
+        res = self.er.execQuery(q)
+        for art in res.get("articles", {}).get("results", []):
+            foundAuthor = False
+            for author in art.get("authors", []):
+                if author["uri"] == authorUri:
+                    foundAuthor = True
+            self.assertTrue(foundAuthor)
+
+        cq = ComplexArticleQuery(BaseQuery(authorUri = authorUri))
+        q = QueryArticles.initWithComplexQuery(cq)
+        q.setRequestedResult(RequestArticlesInfo(count = 100, returnInfo = self.returnInfo))
+        res2 = self.er.execQuery(q)
+
+        self.ensureSameResults(res, res2, '[articles][].totalResults')
+
     def testArticleListWithCategorySearch(self):
         disasterUri = self.er.getCategoryUri("disa")
 
diff --git a/eventregistry/tests/TestQueryEvent.py b/eventregistry/tests/TestQueryEvent.py
index 8204b90..8aeb6db 100644
--- a/eventregistry/tests/TestQueryEvent.py
+++ b/eventregistry/tests/TestQueryEvent.py
@@ -11,18 +11,46 @@ def getValidEvent(self):
         return EventRegistry.getUriFromUriWgt(res["uriWgtList"]["results"])[0]
 
 
+    def testEventArticleFiltering(self):
+        q1 = QueryEventArticlesIter("eng-2860795")
+        counts1 = q1.count(self.er)
+        counts2 = QueryEventArticlesIter("eng-2860795", lang="eng").count(self.er)
+        self.assertTrue(counts1 != counts2)
+        counts3 = QueryEventArticlesIter("eng-2860795", conceptUri=self.er.getConceptUri("Donald Trump")).count(self.er)
+        self.assertTrue(counts1 != counts3)
+        counts4 = QueryEventArticlesIter("eng-2860795", keywords = "Trump").count(self.er)
+        self.assertTrue(counts1 != counts4)
+        counts5 = QueryEventArticlesIter("eng-2860795", sourceUri = self.er.getNewsSourceUri("fox")).count(self.er)
+        self.assertTrue(counts1 != counts5)
+        counts6 = QueryEventArticlesIter("eng-2860795", lang="eng", conceptUri=self.er.getConceptUri("Donald Trump")).count(self.er)
+        self.assertTrue(counts1 != counts6)
+
+        arts1 = [art for art in q1.execQuery(self.er)]
+        self.assertTrue(counts1 == len(arts1))
+
+        q = QueryEvent("eng-2860795")
+        q.setRequestedResult(RequestEventArticles(lang="eng", conceptUri=self.er.getConceptUri("Donald Trump")))
+        res = self.er.execQuery(q)
+        self.assertTrue(counts6 == res["eng-2860795"]["articles"]["totalResults"])
+
+
+
     def testArticleSorting(self):
         q = QueryEventArticlesIter(self.getValidEvent())
 
         # try ascending order
-        wgt = 0
+        wgt = None
         for art in q.execQuery(self.er, sortBy="date", sortByAsc=True):
+            if wgt is None:
+                wgt = art["wgt"]
             self.assertTrue(art["wgt"] >= wgt)
             wgt = art["wgt"]
 
         # try descending order
-        wgt = sys.maxint
+        wgt = None
         for art in q.execQuery(self.er, sortBy="date", sortByAsc=False):
+            if wgt is None:
+                wgt = art["wgt"]
             self.assertTrue(art["wgt"] <= wgt)
             wgt = art["wgt"]
 
@@ -138,7 +166,7 @@ def testEventArticlesIterator(self):
         # check that the iterator really downloads all articles in the event
         iter = QueryEventArticlesIter("eng-2866653")
         articleCount = iter.count(self.er)
-        articles = [art for art in iter.execQuery(self.er, lang = allLangs)]
+        articles = [art for art in iter.execQuery(self.er)]
         if articleCount != len(articles):
             self.fail("Event article iterator did not generate the full list of event articles")
 
diff --git a/eventregistry/tests/TestQueryEvents.py b/eventregistry/tests/TestQueryEvents.py
index 3a6b2a2..0719cc2 100644
--- a/eventregistry/tests/TestQueryEvents.py
+++ b/eventregistry/tests/TestQueryEvents.py
@@ -33,7 +33,8 @@ def testEventListWithKeywordSearch(self):
         res = self.er.execQuery(q)
         self.validateGeneralEventList(res)
 
-        q2 = QueryEvents(keywords = "germany")
+        q2 = QueryEvents.initWithComplexQuery(ComplexEventQuery(
+            BaseQuery(keyword = "germany")))
         q2.setRequestedResult(RequestEventsInfo(count = 10, returnInfo = self.returnInfo))
         res2 = self.er.execQuery(q2)
         self.validateGeneralEventList(res2)
@@ -47,7 +48,8 @@ def testEventListWithSourceSearch(self):
         res = self.er.execQuery(q)
         self.validateGeneralEventList(res)
 
-        q2 = QueryEvents(sourceUri = self.er.getNewsSourceUri("bbc"))
+        q2 = QueryEvents.initWithComplexQuery(ComplexEventQuery(
+            BaseQuery(sourceUri = self.er.getNewsSourceUri("bbc"))))
         q2.setRequestedResult(RequestEventsInfo(count = 10, returnInfo = self.returnInfo))
         res2 = self.er.execQuery(q2)
         self.validateGeneralEventList(res2)
@@ -59,7 +61,20 @@ def testEventListWithCategorySearch(self):
         q = QueryEvents(categoryUri = self.er.getCategoryUri("disa"))
         res = self.er.execQuery(q)
 
-        q2 = QueryEvents(categoryUri = self.er.getCategoryUri("disa"))
+        q2 = QueryEvents.initWithComplexQuery(ComplexEventQuery(
+            BaseQuery(categoryUri = self.er.getCategoryUri("disa"))))
+        res2 = self.er.execQuery(q2)
+
+        self.ensureSameResults(res, res2, '[events][].totalResults')
+
+
+    def testEventListWithAuthorSearch(self):
+        authorUri = self.er.getAuthorUri("associated")
+        q = QueryEvents(authorUri = authorUri)
+        res = self.er.execQuery(q)
+
+        q2 = QueryEvents.initWithComplexQuery(ComplexEventQuery(
+            BaseQuery(authorUri = authorUri)))
         res2 = self.er.execQuery(q2)
 
         self.ensureSameResults(res, res2, '[events][].totalResults')