From 75261995bdb68f54504153313c029033aeb66857 Mon Sep 17 00:00:00 2001 From: kshib Date: Thu, 4 Jul 2024 23:08:25 +0530 Subject: [PATCH 1/5] add prothamalo --- .../national/bangladesh/prothamalo.dart | 148 ++++++++++++++++++ .../national/bangladesh/prothamalo_en.dart | 146 +++++++++++++++++ lib/model/publisher.dart | 4 + .../bangladesh/prothamalo_en_test.dart | 22 +++ .../national/bangladesh/prothamalo_test.dart | 22 +++ 5 files changed, 342 insertions(+) create mode 100644 lib/extractor/general/national/bangladesh/prothamalo.dart create mode 100644 lib/extractor/general/national/bangladesh/prothamalo_en.dart create mode 100644 test/extractor/general/national/bangladesh/prothamalo_en_test.dart create mode 100644 test/extractor/general/national/bangladesh/prothamalo_test.dart diff --git a/lib/extractor/general/national/bangladesh/prothamalo.dart b/lib/extractor/general/national/bangladesh/prothamalo.dart new file mode 100644 index 0000000..190b737 --- /dev/null +++ b/lib/extractor/general/national/bangladesh/prothamalo.dart @@ -0,0 +1,148 @@ +import 'package:raven/brain/dio_manager.dart'; +import 'package:raven/model/article.dart'; +import 'package:raven/model/publisher.dart'; + +class ProthamAlo extends Publisher { + @override + String get name => "প্রথম আলো"; + + @override + String get homePage => "https://www.prothomalo.com"; + + @override + Future> get categories => extractCategories(); + + @override + Category get mainCategory => Category.india; + + Future> extractCategories() async { + return { + "সর্বশেষ": "latest", + "রাজনীতি": "politics", + "বাংলাদেশ": "bangladesh", + "অপরাধ": "crime-bangladesh", + "বিশ্ব": "world-all", + "বাণিজ্য": "business-all", + "মতামত": "opinion-all", + "খেলা": "sports-all", + "বিনোদন": "entertainment-all", + "জীবনযাপন": "lifestyle-all", + }; + } + + @override + Future> categoryArticles( + {String category = "latest", int page = 1}) async { + Set articles = {}; + var limit = 10; + var offset = limit * (page - 1); + String apiUrl = + 
"$homePage/api/v1/collections/$category?offset=$offset&limit=$limit"; + await dio().get(apiUrl).then( + (response) { + if (response.statusCode == 200) { + var articlesData = response.data; + var data = articlesData["items"]; + for (var element in data) { + var title = element['item']['headline'][0]; + var author = element['story']["author-name"]; + var thumbnail = element['story']["hero-image-s3-key"] ?? + element['story']["alternative"]["home"]["default"]["hero-image"] + ["hero-image-s3-key"] ?? + ""; + var time = element['story']["published-at"]; + var articleUrl = element['story']['slug']; + var excerpt = element['story']['summary']; + var tags = + element['story']['sections'].map((e) => e['name']).toList(); + articles.add( + NewsArticle( + publisher: name, + title: title ?? "", + content: "", + excerpt: excerpt ?? "", + author: author ?? "", + url: articleUrl, + thumbnail: thumbnail ?? "", + category: category, + publishedAt: time, + tags: List.from(tags), + ), + ); + } + } + }, + ); + + return articles; + } + + @override + Future> searchedArticles( + {required String searchQuery, int page = 1}) async { + Set articles = {}; + var limit = 10; + var offset = limit * (page - 1); + String apiUrl = + "$homePage/route-data.json?path=/search&q=$searchQuery&offset=$offset&limit=$limit"; + await dio().get(apiUrl).then( + (response) { + if (response.statusCode == 200) { + var articlesData = response.data; + var data = articlesData["data"]["stories"]; + for (var element in data) { + var title = element['headline'][0]; + var author = element["author-name"]; + var thumbnail = element["hero-image-s3-key"] ?? ""; + var time = element["published-at"]; + var articleUrl = element['slug']; + var excerpt = element['summary']; + var tags = element['sections'].map((e) => e['name']).toList(); + articles.add(NewsArticle( + publisher: name, + title: title ?? "", + content: "", + excerpt: excerpt ?? "", + author: author ?? "", + url: articleUrl, + thumbnail: thumbnail ?? 
"", + category: searchQuery, + publishedAt: time, + tags: List.from(tags),),); + } + } + }, + ); + + return articles; + } + + @override + Future article(NewsArticle newsArticle) async { + await dio() + .get('$homePage/route-data.json?path=${newsArticle.url}') + .then((response) { + if (response.statusCode == 200) { + var data = (response.data); + var content = ""; + var cards = data["data"]["story"]["cards"]; + for (var card in cards) { + if (card["story-elements"][0]["type"] == "text") { + content += card["story-elements"][0]["text"]; + } else if (card["story-elements"][0]["type"] == "image") { + var image = card["story-elements"][0]["image-s3-key"]; + content += "

"; + } + } + newsArticle.content = content; + } + }); + return newsArticle; + } + + @override + Future> articles( + {String category = "home", int page = 1}) async { + return super.articles(category: category, page: page); + } +} diff --git a/lib/extractor/general/national/bangladesh/prothamalo_en.dart b/lib/extractor/general/national/bangladesh/prothamalo_en.dart new file mode 100644 index 0000000..00dd2e5 --- /dev/null +++ b/lib/extractor/general/national/bangladesh/prothamalo_en.dart @@ -0,0 +1,146 @@ +import 'package:raven/brain/dio_manager.dart'; +import 'package:raven/model/article.dart'; +import 'package:raven/model/publisher.dart'; + +class ProthamAloEn extends Publisher { + @override + String get name => "Prothom Alo"; + + @override + String get homePage => "https://en.prothomalo.com"; + + @override + Future> get categories => extractCategories(); + + @override + Category get mainCategory => Category.india; + + Future> extractCategories() async { + return { + "Bangladesh": "bangladesh", + "International": "international", + "Sports": "sports", + "Opinion": "opinion", + "Business": "business", + "Youth": "youth", + "Entertainment": "entertainment", + "Lifestyle": "lifestyle", + }; + } + + @override + Future> categoryArticles( + {String category = "latest", int page = 1}) async { + Set articles = {}; + var limit = 10; + var offset = limit * (page - 1); + String apiUrl = + "$homePage/api/v1/collections/$category?offset=$offset&limit=$limit"; + await dio().get(apiUrl).then( + (response) { + if (response.statusCode == 200) { + var articlesData = response.data; + var data = articlesData["items"]; + for (var element in data) { + var title = element['item']['headline'][0]; + var author = element['story']["author-name"]; + var thumbnail = element['story']["hero-image-s3-key"] ?? + element['story']["alternative"]["home"]["default"]["hero-image"] + ["hero-image-s3-key"] ?? 
+ ""; + var time = element['story']["published-at"]; + var articleUrl = element['story']['slug']; + var excerpt = element['story']['summary']; + var tags = + element['story']['sections'].map((e) => e['name']).toList(); + articles.add( + NewsArticle( + publisher: name, + title: title ?? "", + content: "", + excerpt: excerpt ?? "", + author: author ?? "", + url: articleUrl, + thumbnail: thumbnail ?? "", + category: category, + publishedAt: time, + tags: List.from(tags), + ), + ); + } + } + }, + ); + + return articles; + } + + @override + Future> searchedArticles( + {required String searchQuery, int page = 1}) async { + Set articles = {}; + var limit = 10; + var offset = limit * (page - 1); + String apiUrl = + "$homePage/route-data.json?path=/search&q=$searchQuery&offset=$offset&limit=$limit"; + await dio().get(apiUrl).then( + (response) { + if (response.statusCode == 200) { + var articlesData = response.data; + var data = articlesData["data"]["stories"]; + for (var element in data) { + var title = element['headline'][0]; + var author = element["author-name"]; + var thumbnail = element["hero-image-s3-key"] ?? ""; + var time = element["published-at"]; + var articleUrl = element['slug']; + var excerpt = element['summary']; + var tags = element['sections'].map((e) => e['name']).toList(); + articles.add(NewsArticle( + publisher: name, + title: title ?? "", + content: "", + excerpt: excerpt ?? "", + author: author ?? "", + url: articleUrl, + thumbnail: thumbnail ?? 
"", + category: searchQuery, + publishedAt: time, + tags: List.from(tags),),); + } + } + }, + ); + + return articles; + } + + @override + Future article(NewsArticle newsArticle) async { + await dio() + .get('$homePage/route-data.json?path=${newsArticle.url}') + .then((response) { + if (response.statusCode == 200) { + var data = (response.data); + var content = ""; + var cards = data["data"]["story"]["cards"]; + for (var card in cards) { + if (card["story-elements"][0]["type"] == "text") { + content += card["story-elements"][0]["text"]; + } else if (card["story-elements"][0]["type"] == "image") { + var image = card["story-elements"][0]["image-s3-key"]; + content += "

"; + } + } + newsArticle.content = content; + } + }); + return newsArticle; + } + + @override + Future> articles( + {String category = "home", int page = 1}) async { + return super.articles(category: category, page: page); + } +} diff --git a/lib/model/publisher.dart b/lib/model/publisher.dart index d9c056b..84cf775 100644 --- a/lib/model/publisher.dart +++ b/lib/model/publisher.dart @@ -1,5 +1,7 @@ import 'package:raven/extractor/custom/morss.dart'; import 'package:raven/extractor/custom/rss.dart'; +import 'package:raven/extractor/general/national/bangladesh/prothamalo.dart'; +import 'package:raven/extractor/general/national/bangladesh/prothamalo_en.dart'; import 'package:raven/extractor/general/national/india/thehindu.dart'; import 'package:raven/extractor/general/national/india/theindianexpress.dart'; import 'package:raven/extractor/general/national/india/thequint.dart'; @@ -30,6 +32,7 @@ Map publishers = { "CNN": CNN(), "Engadget": Engadget(), "morss": Morss(), + "Protham Alo": ProthamAloEn(), "Reuters": Reuters(), "RSS Feed": RSSFeed(), "The Guardian": TheGuardian(), @@ -40,6 +43,7 @@ Map publishers = { "The Wire": TheWire(), "TorrentFreak": TorrentFreak(), "XDA Developers": XDAdevelopers(), + "প্রথম আলো": ProthamAlo(), }; enum Category { diff --git a/test/extractor/general/national/bangladesh/prothamalo_en_test.dart b/test/extractor/general/national/bangladesh/prothamalo_en_test.dart new file mode 100644 index 0000000..a302c8e --- /dev/null +++ b/test/extractor/general/national/bangladesh/prothamalo_en_test.dart @@ -0,0 +1,22 @@ +import 'package:raven/extractor/general/national/bangladesh/prothamalo.dart'; +import 'package:test/test.dart'; +import 'package:raven/model/publisher.dart'; + +import '../../../common.dart'; + + +void main() { + Publisher publisher = ProthamAlo(); + + test('Extract Categories Test', () async { + await ExtractorTest.categoriesTest(publisher); + }); + + test('Category Articles Test', () async { + await 
ExtractorTest.categoryArticlesTest(publisher); + }); + + test('Search Articles Test', () async { + await ExtractorTest.searchedArticlesTest(publisher, 'ওয়ার্ল্ড'); + }); +} diff --git a/test/extractor/general/national/bangladesh/prothamalo_test.dart b/test/extractor/general/national/bangladesh/prothamalo_test.dart new file mode 100644 index 0000000..a302c8e --- /dev/null +++ b/test/extractor/general/national/bangladesh/prothamalo_test.dart @@ -0,0 +1,22 @@ +import 'package:raven/extractor/general/national/bangladesh/prothamalo.dart'; +import 'package:test/test.dart'; +import 'package:raven/model/publisher.dart'; + +import '../../../common.dart'; + + +void main() { + Publisher publisher = ProthamAlo(); + + test('Extract Categories Test', () async { + await ExtractorTest.categoriesTest(publisher); + }); + + test('Category Articles Test', () async { + await ExtractorTest.categoryArticlesTest(publisher); + }); + + test('Search Articles Test', () async { + await ExtractorTest.searchedArticlesTest(publisher, 'ওয়ার্ল্ড'); + }); +} From 738df34318197c9366a5cdcd2b74aea1cea8c939 Mon Sep 17 00:00:00 2001 From: kshib Date: Mon, 8 Jul 2024 22:26:22 +0530 Subject: [PATCH 2/5] add rfa --- .../general/national/china/rfa_cantonese.dart | 175 ++++++++++++++++++ .../general/national/china/rfa_mandarin.dart | 175 ++++++++++++++++++ .../general/national/china/rfa_tibetan.dart | 175 ++++++++++++++++++ .../general/national/myanmar/rfa_burmese.dart | 175 ++++++++++++++++++ lib/extractor/general/world/rfa_english.dart | 175 ++++++++++++++++++ .../national/china/rfa_cantonese_test.dart | 21 +++ .../national/china/rfa_mandarin_test.dart | 21 +++ .../national/china/rfa_tibetan_test.dart | 22 +++ .../national/myanmar/rfa_burmese_test.dart | 23 +++ .../general/world/rfa_english_test.dart | 21 +++ 10 files changed, 983 insertions(+) create mode 100644 lib/extractor/general/national/china/rfa_cantonese.dart create mode 100644 lib/extractor/general/national/china/rfa_mandarin.dart create mode 
100644 lib/extractor/general/national/china/rfa_tibetan.dart create mode 100644 lib/extractor/general/national/myanmar/rfa_burmese.dart create mode 100644 lib/extractor/general/world/rfa_english.dart create mode 100644 test/extractor/general/national/china/rfa_cantonese_test.dart create mode 100644 test/extractor/general/national/china/rfa_mandarin_test.dart create mode 100644 test/extractor/general/national/china/rfa_tibetan_test.dart create mode 100644 test/extractor/general/national/myanmar/rfa_burmese_test.dart create mode 100644 test/extractor/general/world/rfa_english_test.dart diff --git a/lib/extractor/general/national/china/rfa_cantonese.dart b/lib/extractor/general/national/china/rfa_cantonese.dart new file mode 100644 index 0000000..caa95f8 --- /dev/null +++ b/lib/extractor/general/national/china/rfa_cantonese.dart @@ -0,0 +1,175 @@ +import 'package:dio/dio.dart'; +import 'package:html/parser.dart' as html_parser; +import 'package:raven/brain/dio_manager.dart'; +import 'package:raven/model/article.dart'; +import 'package:raven/model/publisher.dart'; +import 'package:raven/utils/time.dart'; + +class RfaCantonese extends Publisher { + @override + String get name => "RFA 自由亞洲電台粵語部"; + + @override + String get homePage => "https://www.rfa.org/cantonese"; + + @override + Future> get categories async { + Map map = {"Cantonese": "cantonese"}; + await dio().get(homePage, options: options).then( + (response) { + if (response.statusCode == 200) { + var document = html_parser.parse(response.data); + var elements = document.querySelectorAll("*[class*='nav-'] a"); + for (var element in elements) { + if(element.attributes["href"]==homePage) { + continue; + } + map.putIfAbsent( + element.text, + () { + return element.attributes["href"]!.replaceAll("$homePage/", ""); + }, + ); + } + } + }, + ); + return map..removeWhere((key, value) => ["video", "audio", "send_news_form"].contains(value) || value.contains("about/"),); + } + + @override + Category get mainCategory => 
Category.world; + + @override + bool get hasSearchSupport => true; + + Options options = Options(headers: { + "User-Agent": + "Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30", + "Accept": "*/*", + "Accept-Encoding": "gzip, deflate, br", + "Connection": "keep-alive", + }); + + @override + Future article(NewsArticle newsArticle) async { + await dio().get(newsArticle.url, options: options).then((response) { + if (response.statusCode == 200) { + var document = html_parser.parse(response.data); + var content = document.querySelector('#storytext')?.text ?? ""; + var author = document.querySelector("#story_byline")?.text ?? ""; + var thumbnail = document.querySelector("#headerimg img")?.attributes["src"] ?? ""; + + newsArticle = newsArticle.fill( + content: content, + author: author, + tags: [], + thumbnail: thumbnail, + ); + } + }); + + return newsArticle; + } + + @override + Future> categoryArticles({ + String category = "news", + int page = 1, + }) async { + Set articles = {}; + var limit = 15; + var offset = (page - 1) * limit; + + await dio().get( + "$homePage/$category/story_archive?b_start:int=$offset", + options: options, + ) + .then( + (response) { + if (response.statusCode == 200) { + var document = html_parser.parse(response.data); + var data = document.querySelectorAll(".sectionteaser"); + for (var article in data) { + var title = article.querySelector("span")?.text ?? ""; + var thumbnail = + article.querySelector("img")?.attributes["src"] ?? ""; + var publishedAt = article.querySelector(".story_date")?.text ?? ""; + var excerpt = + article.querySelector(".story_description")?.text ?? ""; + var url = article.querySelector("a")?.attributes["href"] ?? 
""; + + articles.add( + NewsArticle( + publisher: name, + title: title, + content: "", + excerpt: excerpt, + author: "", + url: url, + tags: [], + thumbnail: thumbnail, + publishedAt: stringToUnix(publishedAt, format: "yyyy-MM-dd"), + category: category, + ), + ); + } + } + }, + ); + + return articles; + } + + @override + Future> searchedArticles({ + required String searchQuery, + int page = 1, + }) async { + Set articles = {}; + var limit = 30; + var offset = (page - 1) * limit; + await dio() + .get( + "$homePage/@@search?SearchableText=$searchQuery&sort_on=Date&b_start:int=$offset", + options: options) + .then( + (response) { + if (response.statusCode == 200) { + var document = html_parser.parse(response.data); + var data = document.querySelectorAll(".searchresult"); + for (var article in data) { + var title = article.querySelector("a.state-published")?.text ?? ""; + var thumbnail = + article.querySelector("img")?.attributes["src"] ?? ""; + var publishedAt = + article.querySelector(".searchresultdate")?.text.trim() ?? ""; + var excerpt = + article.querySelector(".croppedDescription")?.text ?? ""; + var url = article + .querySelector("a.state-published") + ?.attributes["href"] ?? 
+ ""; + + articles.add( + NewsArticle( + publisher: name, + title: title, + content: "", + excerpt: excerpt, + author: "", + url: url, + tags: [], + thumbnail: thumbnail, + publishedAt: stringToUnix(publishedAt, format: "yyyy-MM-dd"), + category: searchQuery, + ), + ); + } + } + }, + ); + + return articles; + } +} diff --git a/lib/extractor/general/national/china/rfa_mandarin.dart b/lib/extractor/general/national/china/rfa_mandarin.dart new file mode 100644 index 0000000..784a9ec --- /dev/null +++ b/lib/extractor/general/national/china/rfa_mandarin.dart @@ -0,0 +1,175 @@ +import 'package:dio/dio.dart'; +import 'package:html/parser.dart' as html_parser; +import 'package:raven/brain/dio_manager.dart'; +import 'package:raven/model/article.dart'; +import 'package:raven/model/publisher.dart'; +import 'package:raven/utils/time.dart'; + +class RfaMandarin extends Publisher { + @override + String get name => "自由亚洲电台"; + + @override + String get homePage => "https://www.rfa.org/mandarin"; + + @override + Future> get categories async { + Map map = {"Mandarin": "mandarin"}; + await dio().get(homePage, options: options).then( + (response) { + if (response.statusCode == 200) { + var document = html_parser.parse(response.data); + var elements = document.querySelectorAll("*[class*='nav-'] a"); + for (var element in elements) { + if(element.attributes["href"]==homePage) { + continue; + } + map.putIfAbsent( + element.text, + () { + return element.attributes["href"]!.replaceAll("$homePage/", ""); + }, + ); + } + } + }, + ); + return map; + } + + @override + Category get mainCategory => Category.world; + + @override + bool get hasSearchSupport => true; + + Options options = Options(headers: { + "User-Agent": + "Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30", + "Accept": "*/*", + "Accept-Encoding": "gzip, deflate, br", + "Connection": "keep-alive", + }); + + @override + Future 
article(NewsArticle newsArticle) async { + await dio().get(newsArticle.url, options: options).then((response) { + if (response.statusCode == 200) { + var document = html_parser.parse(response.data); + var content = document.querySelector('#storytext')?.text ?? ""; + var author = document.querySelector("#story_byline")?.text ?? ""; + var thumbnail = document.querySelector("#headerimg img")?.attributes["src"] ?? ""; + + newsArticle = newsArticle.fill( + content: content, + author: author, + tags: [], + thumbnail: thumbnail, + ); + } + }); + + return newsArticle; + } + + @override + Future> categoryArticles({ + String category = "news", + int page = 1, + }) async { + Set articles = {}; + var limit = 15; + var offset = (page - 1) * limit; + + await dio().get( + "$homePage/$category/story_archive?b_start:int=$offset", + options: options, + ) + .then( + (response) { + if (response.statusCode == 200) { + var document = html_parser.parse(response.data); + var data = document.querySelectorAll(".sectionteaser"); + for (var article in data) { + var title = article.querySelector("span")?.text ?? ""; + var thumbnail = + article.querySelector("img")?.attributes["src"] ?? ""; + var publishedAt = article.querySelector(".story_date")?.text ?? ""; + var excerpt = + article.querySelector(".story_description")?.text ?? ""; + var url = article.querySelector("a")?.attributes["href"] ?? 
""; + + articles.add( + NewsArticle( + publisher: name, + title: title, + content: "", + excerpt: excerpt, + author: "", + url: url, + tags: [], + thumbnail: thumbnail, + publishedAt: stringToUnix(publishedAt, format: "yyyy-MM-dd"), + category: category, + ), + ); + } + } + }, + ); + + return articles; + } + + @override + Future> searchedArticles({ + required String searchQuery, + int page = 1, + }) async { + Set articles = {}; + var limit = 30; + var offset = (page - 1) * limit; + await dio() + .get( + "$homePage/@@search?SearchableText=$searchQuery&sort_on=Date&b_start:int=$offset", + options: options) + .then( + (response) { + if (response.statusCode == 200) { + var document = html_parser.parse(response.data); + var data = document.querySelectorAll(".searchresult"); + for (var article in data) { + var title = article.querySelector("a.state-published")?.text ?? ""; + var thumbnail = + article.querySelector("img")?.attributes["src"] ?? ""; + var publishedAt = + article.querySelector(".searchresultdate")?.text.trim() ?? ""; + var excerpt = + article.querySelector(".croppedDescription")?.text ?? ""; + var url = article + .querySelector("a.state-published") + ?.attributes["href"] ?? 
+ ""; + + articles.add( + NewsArticle( + publisher: name, + title: title, + content: "", + excerpt: excerpt, + author: "", + url: url, + tags: [], + thumbnail: thumbnail, + publishedAt: stringToUnix(publishedAt, format: "yyyy-MM-dd"), + category: searchQuery, + ), + ); + } + } + }, + ); + + return articles; + } +} diff --git a/lib/extractor/general/national/china/rfa_tibetan.dart b/lib/extractor/general/national/china/rfa_tibetan.dart new file mode 100644 index 0000000..5c75c0f --- /dev/null +++ b/lib/extractor/general/national/china/rfa_tibetan.dart @@ -0,0 +1,175 @@ +import 'package:dio/dio.dart'; +import 'package:html/parser.dart' as html_parser; +import 'package:raven/brain/dio_manager.dart'; +import 'package:raven/model/article.dart'; +import 'package:raven/model/publisher.dart'; +import 'package:raven/utils/time.dart'; + +class RfaTibetan extends Publisher { + @override + String get name => "ཨེ་ཤེ་ཡ་རང་དབང་རླུང་འཕྲིན་ཁང་"; + + @override + String get homePage => "https://www.rfa.org/tibetan"; + + @override + Future> get categories async { + Map map = {"Tibetan": "tibetan"}; + await dio().get(homePage, options: options).then( + (response) { + if (response.statusCode == 200) { + var document = html_parser.parse(response.data); + var elements = document.querySelectorAll(".header_top li a"); + for (var element in elements) { + if(element.attributes["href"]==homePage) { + continue; + } + map.putIfAbsent( + element.text, + () { + return element.attributes["href"]!.replaceAll("$homePage/", ""); + }, + ); + } + } + }, + ); + return map..removeWhere((key, value) => value=="video",); + } + + @override + Category get mainCategory => Category.world; + + @override + bool get hasSearchSupport => true; + + Options options = Options(headers: { + "User-Agent": + "Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30", + "Accept": "*/*", + "Accept-Encoding": "gzip, deflate, br", + 
"Connection": "keep-alive", + }); + + @override + Future article(NewsArticle newsArticle) async { + await dio().get(newsArticle.url, options: options).then((response) { + if (response.statusCode == 200) { + var document = html_parser.parse(response.data); + var content = document.querySelector('#storytext')?.text ?? ""; + var author = document.querySelector("#story_byline")?.text ?? ""; + var thumbnail = document.querySelector("#headerimg img")?.text ?? ""; + + newsArticle = newsArticle.fill( + content: content, + author: author, + tags: [], + thumbnail: thumbnail, + ); + } + }); + + return newsArticle; + } + + @override + Future> categoryArticles({ + String category = "news", + int page = 1, + }) async { + Set articles = {}; + var limit = 15; + var offset = (page - 1) * limit; + + await dio().get( + "$homePage/$category/story_archive?b_start:int=$offset", + options: options, + ) + .then( + (response) { + if (response.statusCode == 200) { + var document = html_parser.parse(response.data); + var data = document.querySelectorAll(".sectionteaser"); + for (var article in data) { + var title = article.querySelector("span")?.text ?? ""; + var thumbnail = + article.querySelector("img")?.attributes["src"] ?? ""; + var publishedAt = article.querySelector(".story_date")?.text ?? ""; + var excerpt = + article.querySelector("story_description")?.text ?? ""; + var url = article.querySelector("a")?.attributes["href"] ?? 
""; + + articles.add( + NewsArticle( + publisher: name, + title: title, + content: "", + excerpt: excerpt, + author: "", + url: url, + tags: [], + thumbnail: thumbnail, + publishedAt: stringToUnix(publishedAt, format: "yyyy-MM-dd"), + category: category, + ), + ); + } + } + }, + ); + + return articles; + } + + @override + Future> searchedArticles({ + required String searchQuery, + int page = 1, + }) async { + Set articles = {}; + var limit = 30; + var offset = (page - 1) * limit; + await dio() + .get( + "$homePage/@@search?SearchableText=$searchQuery&sort_on=Date&b_start:int=$offset", + options: options) + .then( + (response) { + if (response.statusCode == 200) { + var document = html_parser.parse(response.data); + var data = document.querySelectorAll(".searchresult"); + for (var article in data) { + var title = article.querySelector("a.state-published")?.text ?? ""; + var thumbnail = + article.querySelector("img")?.attributes["src"] ?? ""; + var publishedAt = + article.querySelector(".searchresultdate")?.text.trim() ?? ""; + var excerpt = + article.querySelector(".croppedDescription")?.text ?? ""; + var url = article + .querySelector("a.state-published") + ?.attributes["href"] ?? 
+ ""; + + articles.add( + NewsArticle( + publisher: name, + title: title, + content: "", + excerpt: excerpt, + author: "", + url: url, + tags: [], + thumbnail: thumbnail, + publishedAt: stringToUnix(publishedAt, format: "yyyy-MM-dd"), + category: searchQuery, + ), + ); + } + } + }, + ); + + return articles; + } +} diff --git a/lib/extractor/general/national/myanmar/rfa_burmese.dart b/lib/extractor/general/national/myanmar/rfa_burmese.dart new file mode 100644 index 0000000..b1d02e2 --- /dev/null +++ b/lib/extractor/general/national/myanmar/rfa_burmese.dart @@ -0,0 +1,175 @@ +import 'package:dio/dio.dart'; +import 'package:html/parser.dart' as html_parser; +import 'package:raven/brain/dio_manager.dart'; +import 'package:raven/model/article.dart'; +import 'package:raven/model/publisher.dart'; +import 'package:raven/utils/time.dart'; + +class RfaBurmese extends Publisher { + @override + String get name => "မြန်မာဌာန"; + + @override + String get homePage => "https://www.rfa.org/burmese"; + + @override + Future> get categories async { + Map map = {"Burmese": "burmese"}; + await dio().get(homePage, options: options).then( + (response) { + if (response.statusCode == 200) { + var document = html_parser.parse(response.data); + var elements = document.querySelectorAll(".header_top li a"); + for (var element in elements) { + if(element.attributes["href"]==homePage) { + continue; + } + map.putIfAbsent( + element.text, + () { + return element.attributes["href"]!.replaceAll("$homePage/", ""); + }, + ); + } + } + }, + ); + return map..removeWhere((key, value) => value=="video",); + } + + @override + Category get mainCategory => Category.world; + + @override + bool get hasSearchSupport => true; + + Options options = Options(headers: { + "User-Agent": + "Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30", + "Accept": "*/*", + "Accept-Encoding": "gzip, deflate, br", + "Connection": 
"keep-alive", + }); + + @override + Future article(NewsArticle newsArticle) async { + await dio().get(newsArticle.url, options: options).then((response) { + if (response.statusCode == 200) { + var document = html_parser.parse(response.data); + var content = document.querySelector('#storytext')?.text ?? ""; + var author = document.querySelector("#story_byline")?.text ?? ""; + var thumbnail = document.querySelector("#headerimg img")?.text ?? ""; + + newsArticle = newsArticle.fill( + content: content, + author: author, + tags: [], + thumbnail: thumbnail, + ); + } + }); + + return newsArticle; + } + + @override + Future> categoryArticles({ + String category = "news", + int page = 1, + }) async { + Set articles = {}; + var limit = 15; + var offset = (page - 1) * limit; + + await dio().get( + "$homePage/$category/story_archive?b_start:int=$offset", + options: options, + ) + .then( + (response) { + if (response.statusCode == 200) { + var document = html_parser.parse(response.data); + var data = document.querySelectorAll(".sectionteaser"); + for (var article in data) { + var title = article.querySelector("span")?.text ?? ""; + var thumbnail = + article.querySelector("img")?.attributes["src"] ?? ""; + var publishedAt = article.querySelector(".story_date")?.text ?? ""; + var excerpt = + article.querySelector("story_description")?.text ?? ""; + var url = article.querySelector("a")?.attributes["href"] ?? 
""; + + articles.add( + NewsArticle( + publisher: name, + title: title, + content: "", + excerpt: excerpt, + author: "", + url: url, + tags: [], + thumbnail: thumbnail, + publishedAt: stringToUnix(publishedAt, format: "yyyy-MM-dd"), + category: category, + ), + ); + } + } + }, + ); + + return articles; + } + + @override + Future> searchedArticles({ + required String searchQuery, + int page = 1, + }) async { + Set articles = {}; + var limit = 30; + var offset = (page - 1) * limit; + await dio() + .get( + "$homePage/@@search?SearchableText=$searchQuery&sort_on=Date&b_start:int=$offset", + options: options) + .then( + (response) { + if (response.statusCode == 200) { + var document = html_parser.parse(response.data); + var data = document.querySelectorAll(".searchresult"); + for (var article in data) { + var title = article.querySelector("a.state-published")?.text ?? ""; + var thumbnail = + article.querySelector("img")?.attributes["src"] ?? ""; + var publishedAt = + article.querySelector(".searchresultdate")?.text.trim() ?? ""; + var excerpt = + article.querySelector(".croppedDescription")?.text ?? ""; + var url = article + .querySelector("a.state-published") + ?.attributes["href"] ?? 
+ ""; + + articles.add( + NewsArticle( + publisher: name, + title: title, + content: "", + excerpt: excerpt, + author: "", + url: url, + tags: [], + thumbnail: thumbnail, + publishedAt: stringToUnix(publishedAt, format: "yyyy-MM-dd"), + category: searchQuery, + ), + ); + } + } + }, + ); + + return articles; + } +} diff --git a/lib/extractor/general/world/rfa_english.dart b/lib/extractor/general/world/rfa_english.dart new file mode 100644 index 0000000..65bfe8e --- /dev/null +++ b/lib/extractor/general/world/rfa_english.dart @@ -0,0 +1,175 @@ +import 'package:dio/dio.dart'; +import 'package:html/parser.dart' as html_parser; +import 'package:raven/brain/dio_manager.dart'; +import 'package:raven/model/article.dart'; +import 'package:raven/model/publisher.dart'; +import 'package:raven/utils/time.dart'; + +class RfaEnglish extends Publisher { + @override + String get name => "Radio Free Asia"; + + @override + String get homePage => "https://www.rfa.org/english"; + + @override + Future> get categories async { + Map map = {"News": "news"}; + await dio().get(homePage, options: options).then( + (response) { + if (response.statusCode == 200) { + var document = html_parser.parse(response.data); + document.querySelectorAll(".nav-items a").forEach((element) { + map.putIfAbsent( + element.text, + () { + return element.attributes["href"]!.replaceAll("$homePage/", ""); + }, + ); + }); + } + }, + ); + var unsupported = ["Press Room", "Contact", "Jobs and internships"]; + return map + ..removeWhere( + (key, value) => unsupported.contains(key), + ); + } + + @override + Category get mainCategory => Category.world; + + @override + bool get hasSearchSupport => true; + + Options options = Options(headers: { + "User-Agent": + "Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30", + "Accept": "*/*", + "Accept-Encoding": "gzip, deflate, br", + "Connection": "keep-alive", + }); + + @override + Future 
article(NewsArticle newsArticle) async { + await dio().get(newsArticle.url, options: options).then((response) { + if (response.statusCode == 200) { + var document = html_parser.parse(response.data); + var content = document.querySelector('#storytext')?.text ?? ""; + var author = document.querySelector("#story_byline")?.text ?? ""; + var thumbnail = document.querySelector("#headerimg img")?.attributes["src"] ?? ""; + + newsArticle = newsArticle.fill( + content: content, + author: author, + tags: [], + thumbnail: thumbnail, + ); + } + }); + + return newsArticle; + } + + @override + Future> categoryArticles({ + String category = "news", + int page = 1, + }) async { + Set articles = {}; + var limit = 15; + var offset = (page - 1) * limit; + + await dio().get( + "$homePage/$category/story_archive?b_start:int=$offset", + options: options, + ) + .then( + (response) { + if (response.statusCode == 200) { + var document = html_parser.parse(response.data); + var data = document.querySelectorAll(".sectionteaser"); + for (var article in data) { + var title = article.querySelector("span")?.text ?? ""; + var thumbnail = + article.querySelector("img")?.attributes["src"] ?? ""; + var publishedAt = article.querySelector(".story_date")?.text ?? ""; + var excerpt = + article.querySelector(".story_description")?.text ?? ""; + var url = article.querySelector("a")?.attributes["href"] ?? 
""; + + articles.add( + NewsArticle( + publisher: name, + title: title, + content: "", + excerpt: excerpt, + author: "", + url: url, + tags: [], + thumbnail: thumbnail, + publishedAt: stringToUnix(publishedAt, format: "yyyy-MM-dd"), + category: category, + ), + ); + } + } + }, + ); + + return articles; + } + + @override + Future> searchedArticles({ + required String searchQuery, + int page = 1, + }) async { + Set articles = {}; + var limit = 30; + var offset = (page - 1) * limit; + await dio() + .get( + "$homePage/@@search?SearchableText=$searchQuery&sort_on=Date&b_start:int=$offset", + options: options) + .then( + (response) { + if (response.statusCode == 200) { + var document = html_parser.parse(response.data); + var data = document.querySelectorAll(".searchresult"); + for (var article in data) { + var title = article.querySelector("a.state-published")?.text ?? ""; + var thumbnail = + article.querySelector("img")?.attributes["src"] ?? ""; + var publishedAt = + article.querySelector(".searchresultdate")?.text.trim() ?? ""; + var excerpt = + article.querySelector(".croppedDescription")?.text ?? ""; + var url = article + .querySelector("a.state-published") + ?.attributes["href"] ?? 
+ ""; + + articles.add( + NewsArticle( + publisher: name, + title: title, + content: "", + excerpt: excerpt, + author: "", + url: url, + tags: [], + thumbnail: thumbnail, + publishedAt: stringToUnix(publishedAt, format: "yyyy-MM-dd"), + category: searchQuery, + ), + ); + } + } + }, + ); + + return articles; + } +} diff --git a/test/extractor/general/national/china/rfa_cantonese_test.dart b/test/extractor/general/national/china/rfa_cantonese_test.dart new file mode 100644 index 0000000..c1edc8a --- /dev/null +++ b/test/extractor/general/national/china/rfa_cantonese_test.dart @@ -0,0 +1,21 @@ +import 'package:raven/extractor/general/national/china/rfa_cantonese.dart'; +import 'package:test/test.dart'; +import 'package:raven/model/publisher.dart'; + +import '../../../common.dart'; + +void main() { + Publisher publisher = RfaCantonese(); + + test('Extract Categories Test', () async { + await ExtractorTest.categoriesTest(publisher); + }); + + test('Category Articles Test', () async { + await ExtractorTest.categoryArticlesTest(publisher); + }); + + test('Search Articles Test', () async { + await ExtractorTest.searchedArticlesTest(publisher, 'world'); + }); +} diff --git a/test/extractor/general/national/china/rfa_mandarin_test.dart b/test/extractor/general/national/china/rfa_mandarin_test.dart new file mode 100644 index 0000000..8b11750 --- /dev/null +++ b/test/extractor/general/national/china/rfa_mandarin_test.dart @@ -0,0 +1,21 @@ +import 'package:raven/extractor/general/national/china/rfa_mandarin.dart'; +import 'package:test/test.dart'; +import 'package:raven/model/publisher.dart'; + +import '../../../common.dart'; + +void main() { + Publisher publisher = RfaMandarin(); + + test('Extract Categories Test', () async { + await ExtractorTest.categoriesTest(publisher); + }); + + test('Category Articles Test', () async { + await ExtractorTest.categoryArticlesTest(publisher); + }); + + test('Search Articles Test', () async { + await 
ExtractorTest.searchedArticlesTest(publisher, 'world'); + }); +} diff --git a/test/extractor/general/national/china/rfa_tibetan_test.dart b/test/extractor/general/national/china/rfa_tibetan_test.dart new file mode 100644 index 0000000..98969e1 --- /dev/null +++ b/test/extractor/general/national/china/rfa_tibetan_test.dart @@ -0,0 +1,22 @@ +import 'package:raven/extractor/general/national/china/rfa_mandarin.dart'; +import 'package:raven/extractor/general/national/china/rfa_tibetan.dart'; +import 'package:test/test.dart'; +import 'package:raven/model/publisher.dart'; + +import '../../../common.dart'; + +void main() { + Publisher publisher = RfaTibetan(); + + test('Extract Categories Test', () async { + await ExtractorTest.categoriesTest(publisher); + }); + + test('Category Articles Test', () async { + await ExtractorTest.categoryArticlesTest(publisher); + }); + + test('Search Articles Test', () async { + await ExtractorTest.searchedArticlesTest(publisher, 'world'); + }); +} diff --git a/test/extractor/general/national/myanmar/rfa_burmese_test.dart b/test/extractor/general/national/myanmar/rfa_burmese_test.dart new file mode 100644 index 0000000..86a5226 --- /dev/null +++ b/test/extractor/general/national/myanmar/rfa_burmese_test.dart @@ -0,0 +1,23 @@ +import 'package:raven/extractor/general/national/china/rfa_mandarin.dart'; +import 'package:raven/extractor/general/national/china/rfa_tibetan.dart'; +import 'package:raven/extractor/general/national/myanmar/rfa_burmese.dart'; +import 'package:test/test.dart'; +import 'package:raven/model/publisher.dart'; + +import '../../../common.dart'; + +void main() { + Publisher publisher = RfaBurmese(); + + test('Extract Categories Test', () async { + await ExtractorTest.categoriesTest(publisher); + }); + + test('Category Articles Test', () async { + await ExtractorTest.categoryArticlesTest(publisher); + }); + + test('Search Articles Test', () async { + await ExtractorTest.searchedArticlesTest(publisher, 'world'); + }); +} diff 
--git a/test/extractor/general/world/rfa_english_test.dart b/test/extractor/general/world/rfa_english_test.dart new file mode 100644 index 0000000..8c68341 --- /dev/null +++ b/test/extractor/general/world/rfa_english_test.dart @@ -0,0 +1,21 @@ +import 'package:raven/extractor/general/world/rfa_english.dart'; +import 'package:test/test.dart'; +import 'package:raven/model/publisher.dart'; + +import '../../common.dart'; + +void main() { + Publisher publisher = RfaEnglish(); + + test('Extract Categories Test', () async { + await ExtractorTest.categoriesTest(publisher); + }); + + test('Category Articles Test', () async { + await ExtractorTest.categoryArticlesTest(publisher); + }); + + test('Search Articles Test', () async { + await ExtractorTest.searchedArticlesTest(publisher, 'world'); + }); +} From 33bbfbad5e973dcca9e4958b2c0b1836b54ce07a Mon Sep 17 00:00:00 2001 From: kshib Date: Mon, 8 Jul 2024 22:26:44 +0530 Subject: [PATCH 3/5] update prothamalo --- .../general/national/bangladesh/prothamalo.dart | 11 ++++------- .../{prothamalo_en.dart => prothamalo_english.dart} | 11 ++++------- 2 files changed, 8 insertions(+), 14 deletions(-) rename lib/extractor/general/national/bangladesh/{prothamalo_en.dart => prothamalo_english.dart} (95%) diff --git a/lib/extractor/general/national/bangladesh/prothamalo.dart b/lib/extractor/general/national/bangladesh/prothamalo.dart index 190b737..f8acfe0 100644 --- a/lib/extractor/general/national/bangladesh/prothamalo.dart +++ b/lib/extractor/general/national/bangladesh/prothamalo.dart @@ -13,7 +13,10 @@ class ProthamAlo extends Publisher { Future> get categories => extractCategories(); @override - Category get mainCategory => Category.india; + Category get mainCategory => Category.bangladesh; + + @override + bool get hasSearchSupport => true; Future> extractCategories() async { return { @@ -139,10 +142,4 @@ class ProthamAlo extends Publisher { }); return newsArticle; } - - @override - Future> articles( - {String category = "home", int 
page = 1}) async { - return super.articles(category: category, page: page); - } } diff --git a/lib/extractor/general/national/bangladesh/prothamalo_en.dart b/lib/extractor/general/national/bangladesh/prothamalo_english.dart similarity index 95% rename from lib/extractor/general/national/bangladesh/prothamalo_en.dart rename to lib/extractor/general/national/bangladesh/prothamalo_english.dart index 00dd2e5..eec51ad 100644 --- a/lib/extractor/general/national/bangladesh/prothamalo_en.dart +++ b/lib/extractor/general/national/bangladesh/prothamalo_english.dart @@ -13,7 +13,10 @@ class ProthamAloEn extends Publisher { Future> get categories => extractCategories(); @override - Category get mainCategory => Category.india; + Category get mainCategory => Category.bangladesh; + + @override + bool get hasSearchSupport => true; Future> extractCategories() async { return { @@ -137,10 +140,4 @@ class ProthamAloEn extends Publisher { }); return newsArticle; } - - @override - Future> articles( - {String category = "home", int page = 1}) async { - return super.articles(category: category, page: page); - } } From bb1ec1ff2764c47b35ae11200e2ef3aad7e6108a Mon Sep 17 00:00:00 2001 From: kshib Date: Mon, 8 Jul 2024 22:27:04 +0530 Subject: [PATCH 4/5] add new sources --- lib/model/publisher.dart | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/lib/model/publisher.dart b/lib/model/publisher.dart index 84cf775..21459ba 100644 --- a/lib/model/publisher.dart +++ b/lib/model/publisher.dart @@ -1,16 +1,21 @@ import 'package:raven/extractor/custom/morss.dart'; import 'package:raven/extractor/custom/rss.dart'; import 'package:raven/extractor/general/national/bangladesh/prothamalo.dart'; -import 'package:raven/extractor/general/national/bangladesh/prothamalo_en.dart'; +import 'package:raven/extractor/general/national/bangladesh/prothamalo_english.dart'; +import 'package:raven/extractor/general/national/china/rfa_cantonese.dart'; +import 
'package:raven/extractor/general/national/china/rfa_mandarin.dart'; +import 'package:raven/extractor/general/national/china/rfa_tibetan.dart'; import 'package:raven/extractor/general/national/india/thehindu.dart'; import 'package:raven/extractor/general/national/india/theindianexpress.dart'; import 'package:raven/extractor/general/national/india/thequint.dart'; import 'package:raven/extractor/general/national/india/thewire.dart'; +import 'package:raven/extractor/general/national/myanmar/rfa_burmese.dart'; import 'package:raven/extractor/general/world/aljazeera.dart'; import 'package:raven/extractor/general/world/apnews.dart'; import 'package:raven/extractor/general/world/bbc.dart'; import 'package:raven/extractor/general/world/cnn.dart'; import 'package:raven/extractor/general/world/reuters.dart'; +import 'package:raven/extractor/general/world/rfa_english.dart'; import 'package:raven/extractor/general/world/theguardian.dart'; import 'package:raven/extractor/technology/androidpolice.dart'; import 'package:raven/extractor/technology/arstechnica.dart'; @@ -34,6 +39,7 @@ Map publishers = { "morss": Morss(), "Protham Alo": ProthamAloEn(), "Reuters": Reuters(), + "Radio Free Asia": RfaEnglish(), "RSS Feed": RSSFeed(), "The Guardian": TheGuardian(), "The Hindu": TheHindu(), @@ -43,13 +49,24 @@ Map publishers = { "The Wire": TheWire(), "TorrentFreak": TorrentFreak(), "XDA Developers": XDAdevelopers(), + "প্রথম আলো": ProthamAlo(), + "မြန်မာဌာန": RfaBurmese(), + "RFA 自由亞洲電台粵語部": RfaCantonese(), + "自由亚洲电台": RfaMandarin(), + "ཨེ་ཤེ་ཡ་རང་དབང་རླུང་འཕྲིན་ཁང་": RfaTibetan(), }; enum Category { - world, technology, + world, + + // countries + bangladesh, + china, india, + + // misc custom, } From 62b40aaca590148b8a59a36050d889f705736397 Mon Sep 17 00:00:00 2001 From: kshib Date: Mon, 8 Jul 2024 22:27:42 +0530 Subject: [PATCH 5/5] add request retry --- lib/brain/dio_manager.dart | 17 ++++++++++++++++- pubspec.lock | 8 ++++++++ pubspec.yaml | 1 + 3 files changed, 25 insertions(+), 1 
deletion(-) diff --git a/lib/brain/dio_manager.dart b/lib/brain/dio_manager.dart index 3f4a59a..d573ef4 100644 --- a/lib/brain/dio_manager.dart +++ b/lib/brain/dio_manager.dart @@ -1,5 +1,20 @@ import 'package:dio/dio.dart'; +import 'package:dio_smart_retry/dio_smart_retry.dart'; Dio dio() { - return Dio()..options = BaseOptions(validateStatus: (status) => true); + final dio_ = Dio(); + dio_.options = BaseOptions(validateStatus: (status) => true); + dio_.interceptors.add( + RetryInterceptor( + dio: dio_, + logPrint: print, + retries: 3, + retryDelays: const [ + Duration(seconds: 2), + Duration(seconds: 3), + Duration(seconds: 5), + ], + ), + ); + return dio_; } diff --git a/pubspec.lock b/pubspec.lock index ee5e62a..d168d06 100644 --- a/pubspec.lock +++ b/pubspec.lock @@ -294,6 +294,14 @@ packages: url: "https://pub.dev" source: hosted version: "3.2.2" + dio_smart_retry: + dependency: "direct main" + description: + name: dio_smart_retry + sha256: "3d71450c19b4d91ef4c7d726a55a284bfc11eb3634f1f25006cdfab3f8595653" + url: "https://pub.dev" + source: hosted + version: "6.0.0" dynamic_color: dependency: "direct main" description: diff --git a/pubspec.yaml b/pubspec.yaml index 8ecb5b3..d013424 100644 --- a/pubspec.yaml +++ b/pubspec.yaml @@ -14,6 +14,7 @@ dependencies: device_info_plus: ^10.1.0 dio: ^5.4.3+1 dio_cache_interceptor_hive_store: ^3.2.2 + dio_smart_retry: ^6.0.0 dynamic_color: ^1.7.0 flutter: sdk: flutter