From 813ceaae88f720eacb5e19f157f8686b7373a579 Mon Sep 17 00:00:00 2001 From: Milos Marinkovic Date: Mon, 5 Sep 2016 11:13:57 +0200 Subject: [PATCH] Updating versioning and descriptions --- .gitignore | 1 + README.md | 16 ++++++++++++++++ app/build.gradle | 4 ++-- goose/build.gradle | 6 +++--- gradle.properties | 4 ++-- gradle/gradle-mvn-push.gradle | 18 +----------------- 6 files changed, 25 insertions(+), 24 deletions(-) diff --git a/.gitignore b/.gitignore index 437b98c78..dc66badf4 100644 --- a/.gitignore +++ b/.gitignore @@ -57,6 +57,7 @@ captures/ # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 # User-specific stuff: +.idea/* .idea/tasks.xml .idea/dictionaries .idea/vcs.xml diff --git a/README.md b/README.md index 223438c68..7a9841a12 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,22 @@ Since version `1.5.0`, Goose for Android also uses the official `HttpURLConnecti SSL libraries such as OpenSSL. A sample of how to use Goose for Android can be found in `DemoActivity.java` in the `app` folder source. +How it works +------------ +**Document cleaning** + +When you pass a URL to Goose, the first thing it starts to do is _clean up the document_ to make it easier to parse. It will go through the whole document and remove comments, common social network sharing elements, convert `em` and other tags to plain text nodes, try to convert `divs` used as text nodes to paragraphs, as well as do a general document cleanup (spaces, new lines, quotes, encoding, etc). + +**Content / Images Extraction** + +When dealing with random article links you're bound to come across the craziest of HTML files. Some sites even like to include 2 or more HTML files per site. Goose uses a scoring system based on clustering of English stop words and other factors that you can find in the code. Goose also does _descending scoring_ so as the nodes move down - the lower their scores become. 
The goal is to find the strongest grouping of text nodes inside a parent container and assume that's the relevant group of content, as long as it's high enough (up) on the page. + +Image extraction is the step that takes the longest. Finding the most important image on a page proved to be challenging and required downloading all the images to manually inspect them using external tools (not all images are considered; Goose checks MIME types, dimensions, byte sizes, compression quality, etc.). Java's Image functions were just too unreliable and inaccurate. On Android, Goose uses the `BitmapFactory` class, which is well documented, tested, fast and accurate. Images are analyzed starting from the top node that Goose finds the content in, followed by a recursive run outwards trying to find good images - Goose also checks whether those images are ads, banners or author logos, and ignores them if so. + +**Output Formatting** + +Once Goose has the top node where it thinks the content is, it will try to format the content of that node for the output. For example, for NLP-type applications, Goose's output formatter will just extract all the text and ignore everything else, while other (custom) extractors can be built to offer a more Flipboard-style experience.
+ Requirements ------------ - Android 4.0 or later (Minimum SDK level 14) diff --git a/app/build.gradle b/app/build.gradle index b048190fb..f4f352d23 100644 --- a/app/build.gradle +++ b/app/build.gradle @@ -8,8 +8,8 @@ android { applicationId "me.angrybyte.goosedemo" minSdkVersion 14 targetSdkVersion 24 - versionCode 4 - versionName "1.3" + versionCode 5 + versionName "1.5.1" } lintOptions { checkReleaseBuilds = false diff --git a/goose/build.gradle b/goose/build.gradle index 13e10b4ab..0362b0348 100644 --- a/goose/build.gradle +++ b/goose/build.gradle @@ -20,7 +20,7 @@ ext { siteUrl = 'https://github.com/milosmns/goose' gitUrl = 'https://github.com/milosmns/goose.git' - libraryVersion = '1.5.0' + libraryVersion = '1.5.1' developerId = 'milosmns' developerName = 'Milos Marinkovic' @@ -38,8 +38,8 @@ android { defaultConfig { minSdkVersion 14 targetSdkVersion 24 - versionCode 10 - versionName "1.5.0" + versionCode 11 + versionName "1.5.1" } buildTypes { release { diff --git a/gradle.properties b/gradle.properties index f2378aeaa..ec36a3345 100644 --- a/gradle.properties +++ b/gradle.properties @@ -18,7 +18,7 @@ # org.gradle.parallel=true GROUP=me.angrybyte.goose -VERSION_NAME=1.5.0 +VERSION_NAME=1.5.1 POM_DESCRIPTION=Goose - Article Extractor for Android @@ -34,4 +34,4 @@ POM_ALL_LICENCES=['Apache-2.0'] POM_LICENCE_DIST=repo POM_DEVELOPER_ID=milosmns -POM_DEVELOPER_NAME=Milos Marinkovic \ No newline at end of file +POM_DEVELOPER_NAME=Milos Marinkovic diff --git a/gradle/gradle-mvn-push.gradle b/gradle/gradle-mvn-push.gradle index 9b7cddb9a..2022e3b7b 100644 --- a/gradle/gradle-mvn-push.gradle +++ b/gradle/gradle-mvn-push.gradle @@ -1,19 +1,3 @@ -/* - * Copyright 2013 Chris Banes - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - apply plugin: 'com.jfrog.bintray' apply plugin: 'com.github.dcendents.android-maven' @@ -178,4 +162,4 @@ artifacts { archives sourcesJar archives javadocJar } -} \ No newline at end of file +}