Skip to content

Commit

Permalink
add indianexpress
Browse files Browse the repository at this point in the history
  • Loading branch information
ksh-b committed Apr 14, 2024
1 parent 1ec999e commit 01fb89a
Show file tree
Hide file tree
Showing 3 changed files with 145 additions and 0 deletions.
121 changes: 121 additions & 0 deletions lib/extractor/general/national/india/theindianexpress.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import 'package:html/dom.dart';
import 'package:raven/model/article.dart';
import 'package:raven/model/publisher.dart';
import 'dart:convert';
import 'package:http/http.dart' as http;
import 'package:html/parser.dart' as html_parser;
import 'package:raven/utils/time.dart';

/// [Publisher] implementation that scrapes https://indianexpress.com.
///
/// Categories are read from the site's navigation bar, article listings from
/// the paginated category pages, and article bodies from the article page
/// itself. Search is not supported ([hasSearchSupport] is `false`).
class TheIndianExpress extends Publisher {
  @override
  String get name => "The Indian Express";

  @override
  String get homePage => "https://indianexpress.com";

  @override
  Future<Map<String, String>> get categories => extractCategories();

  @override
  Category get mainCategory => Category.india;

  @override
  bool get hasSearchSupport => false;

  /// Fetches the home page and builds a map of category name to site-relative
  /// path from the links inside `#navbar`.
  ///
  /// Only links whose path contains `section` are kept, and a fixed list of
  /// unsupported sections is dropped. Returns an empty map when the home page
  /// does not answer with HTTP 200.
  Future<Map<String, String>> extractCategories() async {
    Map<String, String> map = {};
    var response = await http.get(Uri.parse(homePage));
    if (response.statusCode == 200) {
      var document = html_parser.parse(utf8.decode(response.bodyBytes));
      // The first navbar link is ignored. `skip(1)` is used instead of
      // `sublist(1)` because the latter throws a RangeError when the selector
      // matches nothing.
      for (var element in document.querySelectorAll("#navbar a").skip(1)) {
        var href = element.attributes["href"];
        if (href == null) {
          // An anchor without a target cannot be a category; skip it rather
          // than crashing on a null assertion.
          continue;
        }
        // putIfAbsent: the first link seen for a given label wins.
        map.putIfAbsent(
          element.text.trim(),
          () => href.replaceFirst(homePage, ""),
        );
      }
    }
    var unsupported = [
      "Opinion",
      "Explained",
      "Entertainment",
      "Tech",
      "Research",
      "Videos",
    ];
    return map
      ..removeWhere(
        (key, value) => !value.contains("section") || unsupported.contains(key),
      );
  }

  /// Downloads the page for [newsArticle] and fills in its content,
  /// thumbnail, excerpt, publication time and tags.
  ///
  /// Returns [newsArticle] untouched when the request does not answer with
  /// HTTP 200.
  @override
  Future<NewsArticle> article(NewsArticle newsArticle) async {
    var response = await http.get(Uri.parse("$homePage${newsArticle.url}"));
    if (response.statusCode == 200) {
      Document document = html_parser.parse(utf8.decode(response.bodyBytes));
      var content =
          document.querySelector("#pcl-full-content")?.innerHtml ?? "";
      // NOTE(review): this reads the `content` attribute of an <img>; image
      // elements normally carry `src`, so this may always be null - confirm
      // against the live markup.
      var thumbnail =
          document.querySelector(".custom-caption img")?.attributes["content"];
      var excerpt = document.querySelector(".synopsis")?.text;
      var timestamp = document
          .querySelector("span[itemprop=dateModified]")
          ?.attributes["content"];
      // The first breadcrumb link is dropped. `skip(1)` tolerates a page with
      // no breadcrumb at all, where `sublist(1)` would throw a RangeError.
      var tags = document
          .querySelectorAll(".m-breadcrumb a")
          .skip(1)
          .map((e) => e.text)
          .toList();
      return newsArticle.fill(
        content: content,
        thumbnail: thumbnail,
        excerpt: excerpt,
        publishedAt: parseDateString(
          timestamp ?? "",
          format: "yyyy-MM-ddTHH:mm:ssZ",
        ),
        tags: tags,
      );
    }
    return newsArticle;
  }

  /// Scrapes one [page] of the listing for [category].
  ///
  /// The root category `/` is mapped to `/latest-news`. On the first page the
  /// `.swiper-slide` elements are collected in addition to the regular
  /// `.nation .articles` entries. Returns an empty set when the request does
  /// not answer with HTTP 200.
  @override
  Future<Set<NewsArticle>> categoryArticles({
    String category = "/",
    int page = 1,
  }) async {
    if (category == "/") {
      category = "/latest-news";
    }

    Set<NewsArticle> articles = {};
    String url = "$homePage$category/page/$page";

    final response = await http.get(Uri.parse(url));
    if (response.statusCode == 200) {
      Document document = html_parser.parse(utf8.decode(response.bodyBytes));
      List<Element> articleElements = [
        if (page == 1) ...document.querySelectorAll(".swiper-slide"),
        ...document.querySelectorAll(".nation .articles"),
      ];
      for (var article in articleElements) {
        List<String> tags = [];
        // Prefer the first link's text; fall back to the headline link when
        // the first link has no text.
        var title = article.querySelector("a")?.text.trim() ?? "";
        if (title.isEmpty) {
          title = article.querySelector("h2 a")?.text.trim() ?? "";
        }
        var articleUrl = article.querySelector("a")?.attributes["href"] ?? "";
        var thumbnail = article.querySelector("img")?.attributes["src"];
        // Non-breaking spaces are normalised to plain spaces so the date
        // string can match the space-separated format below. The escape is
        // spelled out so the character cannot be silently lost in editing.
        var timestamp = article
                .querySelector(".date")
                ?.text
                .replaceAll("\u00A0", " ")
                .trim() ??
            "";
        var excerpt = article.querySelector(".date+p")?.text ?? "";
        // Keep only the part after an "Updated:" label when one is present.
        if (timestamp.contains("Updated:")) {
          timestamp = timestamp.split("Updated:")[1].trim();
        }
        articles.add(
          NewsArticle(
            publisher: this,
            title: title,
            content: "",
            excerpt: excerpt,
            author: "",
            url: articleUrl.replaceFirst(homePage, ""),
            tags: tags,
            thumbnail: thumbnail ?? "",
            publishedAt: parseDateString(
              timestamp,
              format: "MMMM d, yyyy HH:mm z",
            ),
            category: category,
          ),
        );
      }
    }
    return articles;
  }

  /// Search is not supported for this publisher; always returns an empty set.
  @override
  Future<Set<NewsArticle>> searchedArticles({
    required String searchQuery,
    int page = 1,
  }) async {
    return {};
  }
}
2 changes: 2 additions & 0 deletions lib/model/publisher.dart
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import 'package:raven/extractor/general/national/india/thehindu.dart';
import 'package:raven/extractor/general/national/india/theindianexpress.dart';
import 'package:raven/extractor/general/national/india/thequint.dart';
import 'package:raven/extractor/general/national/india/thewire.dart';
import 'package:raven/extractor/general/world/aljazeera.dart';
Expand Down Expand Up @@ -26,6 +27,7 @@ Map<String, Publisher> publishers = {
"Reuters": Reuters(),
"The Guardian": TheGuardian(),
"The Hindu": TheHindu(),
"The Indian Express": TheIndianExpress(),
"The Verge": TheVerge(),
"The Quint": TheQuint(),
"The Wire": TheWire(),
Expand Down
22 changes: 22 additions & 0 deletions test/extractor/general/national/india/theindianexpress_test.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import 'package:raven/extractor/general/national/india/theindianexpress.dart';
import 'package:test/test.dart';
import 'package:raven/model/publisher.dart';

import '../../../common.dart';


/// Runs the shared extractor checks from `common.dart` against
/// [TheIndianExpress].
void main() {
  final Publisher publisher = TheIndianExpress();

  // Each callback returns the helper's future so the test runner awaits it.
  test(
    'Extract Categories Test',
    () => ExtractorTest.categoriesTest(publisher),
  );

  test(
    'Category Articles Test',
    () => ExtractorTest.categoryArticlesTest(publisher),
  );

  test(
    'Search Articles Test',
    () => ExtractorTest.searchedArticlesTest(publisher, 'world'),
  );
}

0 comments on commit 01fb89a

Please sign in to comment.