diff --git a/lib/extractor/general/national/india/theindianexpress.dart b/lib/extractor/general/national/india/theindianexpress.dart
new file mode 100644
index 0000000..b78b9fb
--- /dev/null
+++ b/lib/extractor/general/national/india/theindianexpress.dart
@@ -0,0 +1,187 @@
+import 'package:html/dom.dart';
+import 'package:raven/model/article.dart';
+import 'package:raven/model/publisher.dart';
+import 'dart:convert';
+import 'package:http/http.dart' as http;
+import 'package:html/parser.dart' as html_parser;
+import 'package:raven/utils/time.dart';
+
+/// Extractor for The Indian Express (indianexpress.com).
+///
+/// Category links, article listings and article pages are scraped from the
+/// site's HTML; search is not supported.
+class TheIndianExpress extends Publisher {
+  @override
+  String get name => "The Indian Express";
+
+  @override
+  String get homePage => "https://indianexpress.com";
+
+  @override
+  Future<Map<String, String>> get categories => extractCategories();
+
+  @override
+  Category get mainCategory => Category.india;
+
+  @override
+  bool get hasSearchSupport => false;
+
+  /// Builds a map from category label to site-relative path using the links
+  /// inside `#navbar` on the home page.
+  ///
+  /// The first navbar link is skipped, as are links without an `href`. Entries
+  /// whose path does not contain "section", or whose label is in the
+  /// unsupported list, are removed. Returns an empty map when the home page
+  /// does not answer with HTTP 200.
+  Future<Map<String, String>> extractCategories() async {
+    const unsupported = [
+      "Opinion",
+      "Explained",
+      "Entertainment",
+      "Tech",
+      "Research",
+      "Videos",
+    ];
+    Map<String, String> map = {};
+    var response = await http.get(Uri.parse(homePage));
+    if (response.statusCode != 200) {
+      return map;
+    }
+    var document = html_parser.parse(utf8.decode(response.bodyBytes));
+    // skip(1) instead of sublist(1): sublist throws a RangeError when the
+    // selector matches nothing.
+    for (var element in document.querySelectorAll("#navbar a").skip(1)) {
+      var href = element.attributes["href"];
+      if (href == null) {
+        continue;
+      }
+      map.putIfAbsent(
+        element.text.trim(),
+        () => href.replaceFirst(homePage, ""),
+      );
+    }
+    map.removeWhere(
+      (key, value) => !value.contains("section") || unsupported.contains(key),
+    );
+    return map;
+  }
+
+  /// Downloads the page of [newsArticle] and fills in its content, thumbnail,
+  /// excerpt, modification time and breadcrumb tags.
+  ///
+  /// Returns [newsArticle] unchanged when the page does not answer with
+  /// HTTP 200.
+  @override
+  Future<NewsArticle> article(NewsArticle newsArticle) async {
+    var response = await http.get(Uri.parse("$homePage${newsArticle.url}"));
+    if (response.statusCode != 200) {
+      return newsArticle;
+    }
+    Document document = html_parser.parse(utf8.decode(response.bodyBytes));
+    var content =
+        document.querySelector("#pcl-full-content")?.innerHtml ?? "";
+    // NOTE(review): this reads the image's `content` attribute, not `src` —
+    // confirm against the page markup.
+    var thumbnail =
+        document.querySelector(".custom-caption img")?.attributes["content"];
+    var excerpt = document.querySelector(".synopsis")?.text;
+    var timestamp = document
+        .querySelector("span[itemprop=dateModified]")
+        ?.attributes["content"];
+    // The first breadcrumb link is dropped. skip(1) is used because
+    // sublist(1) throws a RangeError when there are no breadcrumb links.
+    var tags = document
+        .querySelectorAll(".m-breadcrumb a")
+        .skip(1)
+        .map((e) => e.text)
+        .toList();
+    return newsArticle.fill(
+      content: content,
+      thumbnail: thumbnail,
+      excerpt: excerpt,
+      publishedAt: parseDateString(
+        timestamp ?? "",
+        format: "yyyy-MM-ddTHH:mm:ssZ",
+      ),
+      tags: tags,
+    );
+  }
+
+  /// Fetches one page of the article listing for [category].
+  ///
+  /// A [category] of "/" is mapped to "/latest-news". On the first [page] the
+  /// `.swiper-slide` elements are read in addition to the `.nation .articles`
+  /// entries. Returns an empty set when the listing does not answer with
+  /// HTTP 200.
+  @override
+  Future<Set<NewsArticle>> categoryArticles({
+    String category = "/",
+    int page = 1,
+  }) async {
+    if (category == "/") {
+      category = "/latest-news";
+    }
+
+    Set<NewsArticle> articles = {};
+    String url = "$homePage$category/page/$page";
+
+    final response = await http.get(Uri.parse(url));
+    if (response.statusCode != 200) {
+      return articles;
+    }
+    Document document = html_parser.parse(utf8.decode(response.bodyBytes));
+    List<Element> articleElements = [
+      if (page == 1) ...document.querySelectorAll(".swiper-slide"),
+      ...document.querySelectorAll(".nation .articles"),
+    ];
+    for (var article in articleElements) {
+      var link = article.querySelector("a");
+      var title = link?.text.trim() ?? "";
+      if (title.isEmpty) {
+        title = article.querySelector("h2 a")?.text.trim() ?? "";
+      }
+      var articleUrl = link?.attributes["href"] ?? "";
+      var thumbnail = article.querySelector("img")?.attributes["src"];
+      // Normalise non-breaking spaces (U+00A0) in the date text.
+      // NOTE(review): the literal this replaces was rendered as a plain space,
+      // which would be a no-op — confirm U+00A0 was the intended character.
+      var timestamp = article
+              .querySelector(".date")
+              ?.text
+              .replaceAll("\u00A0", " ")
+              .trim() ??
+          "";
+      if (timestamp.contains("Updated:")) {
+        timestamp = timestamp.split("Updated:")[1].trim();
+      }
+      var excerpt = article.querySelector(".date+p")?.text ?? "";
+      articles.add(
+        NewsArticle(
+          publisher: this,
+          title: title,
+          content: "",
+          excerpt: excerpt,
+          author: "",
+          url: articleUrl.replaceFirst(homePage, ""),
+          tags: [],
+          thumbnail: thumbnail ?? "",
+          publishedAt: parseDateString(
+            timestamp,
+            format: "MMMM d, yyyy HH:mm z",
+          ),
+          category: category,
+        ),
+      );
+    }
+    return articles;
+  }
+
+  /// Returns an empty set: [hasSearchSupport] is false for this publisher.
+  @override
+  Future<Set<NewsArticle>> searchedArticles({
+    required String searchQuery,
+    int page = 1,
+  }) async {
+    return {};
+  }
+}
diff --git a/lib/model/publisher.dart b/lib/model/publisher.dart
index f94b13a..c9f7408 100644
--- a/lib/model/publisher.dart
+++ b/lib/model/publisher.dart
@@ -1,4 +1,5 @@
 import 'package:raven/extractor/general/national/india/thehindu.dart';
+import 'package:raven/extractor/general/national/india/theindianexpress.dart';
 import 'package:raven/extractor/general/national/india/thequint.dart';
 import 'package:raven/extractor/general/national/india/thewire.dart';
 import 'package:raven/extractor/general/world/aljazeera.dart';
@@ -26,6 +27,7 @@ Map publishers = {
   "Reuters": Reuters(),
   "The Guardian": TheGuardian(),
   "The Hindu": TheHindu(),
+  "The Indian Express": TheIndianExpress(),
   "The Verge": TheVerge(),
   "The Quint": TheQuint(),
   "The Wire": TheWire(),
diff --git a/test/extractor/general/national/india/theindianexpress_test.dart b/test/extractor/general/national/india/theindianexpress_test.dart
new file mode 100644
index 0000000..0eb456b
--- /dev/null
+++ b/test/extractor/general/national/india/theindianexpress_test.dart
@@ -0,0 +1,22 @@
+import 'package:raven/extractor/general/national/india/theindianexpress.dart';
+import 'package:test/test.dart';
+import 'package:raven/model/publisher.dart';
+
+import '../../../common.dart';
+
+
+void main() {
+  Publisher publisher = TheIndianExpress();
+
+  test('Extract Categories Test', () async {
+    await ExtractorTest.categoriesTest(publisher);
+  });
+
+  test('Category Articles Test', () async {
+    await ExtractorTest.categoryArticlesTest(publisher);
+  });
+
+  test('Search Articles Test', () async {
+    await ExtractorTest.searchedArticlesTest(publisher, 'world');
+  });
+}