Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add a new proxy provider named effectiveProxyProvider #819

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
package us.codecraft.webmagic.proxy;

import java.util.List;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.commons.collections.CollectionUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Task;

/**
 * A {@link ProxyProvider} that hands out proxies in round-robin order, drops proxies
 * that are reported invalid, and refills its pool in the background.<br><br>
 * Features:<br>
 * 1. A returned proxy is removed when the configured {@link ProxyPageValidator}
 *    rejects it.<br>
 * 2. When the number of pooled proxies falls to {@code expandPoolSize} or below,
 *    {@link #addProxies()} is invoked on a background thread to fetch more. At most
 *    one refill is queued or running at any time.<br><br>
 * Subclasses supply new proxies by implementing {@link #addProxies()}.
 *
 * @author evan
 */
public abstract class EffectiveProxyProvider implements ProxyProvider {

    /** Refill threshold used when the caller does not specify one. */
    public static final int DEFAULT_EXPAND_POOL_SIZE = 1;

    /** Proxies currently considered usable; head is the next proxy to hand out. */
    private final ConcurrentLinkedQueue<Proxy> validProxyQueue = new ConcurrentLinkedQueue<Proxy>();

    /**
     * Single background worker that runs refills. The thread is a daemon so that an
     * idle provider never prevents the JVM from exiting.
     */
    private final ExecutorService addProxyPool = Executors.newFixedThreadPool(1,
            new java.util.concurrent.ThreadFactory() {
                @Override
                public Thread newThread(Runnable task) {
                    Thread worker = new Thread(task, "effective-proxy-provider-expand");
                    worker.setDaemon(true);
                    return worker;
                }
            });

    /** Serializes the "is a refill already pending?" check-and-set in {@link #expand()}. */
    private final ReentrantLock addProxyLock = new ReentrantLock();

    /** Makes poll+offer in {@link #getProxy(Task)} atomic with respect to removals. */
    private final ReentrantLock pollAndOfferLock = new ReentrantLock();

    /** Pool size at or below which a refill is requested. */
    private final int expandPoolSize;

    /** Decides whether a returned proxy is still usable; {@code null} disables removal. */
    private final ProxyPageValidator proxyPageValidator;

    /**
     * True from the moment a refill is submitted until the refill task finishes.
     * Set under {@link #addProxyLock}; cleared by the worker thread.
     */
    private volatile boolean expandPending;

    /**
     * Creates an empty provider without a validator and with the default threshold.
     * The pool is filled by the first background refill.
     */
    public EffectiveProxyProvider() {
        this.proxyPageValidator = null;
        this.expandPoolSize = DEFAULT_EXPAND_POOL_SIZE;
    }

    /**
     * Creates a provider whose initial pool is loaded from {@link #addProxies()}.
     * <p>
     * Note: {@code addProxies()} is called from this constructor, i.e. before the
     * subclass' own fields have been initialized.
     *
     * @param proxyPageValidator validator used by {@link #returnProxy}; may be {@code null}
     */
    public EffectiveProxyProvider(ProxyPageValidator proxyPageValidator) {
        this(proxyPageValidator, DEFAULT_EXPAND_POOL_SIZE);
    }

    /**
     * Creates a provider whose initial pool is loaded from {@link #addProxies()}.
     * <p>
     * Note: {@code addProxies()} is called from this constructor, i.e. before the
     * subclass' own fields have been initialized.
     *
     * @param proxyPageValidator validator used by {@link #returnProxy}; may be {@code null}
     * @param expandPoolSize     pool size at or below which a refill is requested
     */
    public EffectiveProxyProvider(ProxyPageValidator proxyPageValidator, int expandPoolSize) {
        this.proxyPageValidator = proxyPageValidator;
        this.expandPoolSize = expandPoolSize;
        enqueue(addProxies());
    }

    /**
     * Creates a provider with an explicit initial pool and the default threshold.
     *
     * @param pageValidator validator used by {@link #returnProxy}; may be {@code null}
     * @param proxies       initial proxies; {@code null} or empty leaves the pool empty
     */
    public EffectiveProxyProvider(ProxyPageValidator pageValidator, List<Proxy> proxies) {
        this(pageValidator, proxies, DEFAULT_EXPAND_POOL_SIZE);
    }

    /**
     * Creates a provider with an explicit initial pool.
     *
     * @param pageValidator  validator used by {@link #returnProxy}; may be {@code null}
     * @param proxies        initial proxies; {@code null} or empty leaves the pool empty
     * @param expandPoolSize pool size at or below which a refill is requested
     */
    public EffectiveProxyProvider(ProxyPageValidator pageValidator, List<Proxy> proxies, int expandPoolSize) {
        this.proxyPageValidator = pageValidator;
        this.expandPoolSize = expandPoolSize;
        enqueue(proxies);
    }

    /**
     * Returns the next proxy in round-robin order and requests a background refill
     * when the pool is at or below the threshold.
     *
     * @param task the task asking for a proxy (not inspected here)
     * @return the next proxy, or {@code null} when the pool is currently empty
     */
    @Override
    public Proxy getProxy(Task task) {
        // poll + offer must be atomic, otherwise a concurrent removal could miss the proxy
        pollAndOfferLock.lock();
        try {
            Proxy proxy = validProxyQueue.poll();
            if (proxy != null) {
                // re-append at the tail so the queue behaves like a ring
                validProxyQueue.offer(proxy);
            }

            // fetch more proxies when the pool has shrunk to the threshold
            if (validProxyQueue.size() <= expandPoolSize) {
                expand();
            }
            return proxy;
        } finally {
            pollAndOfferLock.unlock();
        }
    }

    /**
     * Schedules an asynchronous call to {@link #addProxies()} and appends its result
     * to the pool. Does nothing when a refill is already queued or running, or when
     * another thread is scheduling one right now.
     */
    public void expand() {
        if (!addProxyLock.tryLock()) {
            return;
        }
        try {
            if (expandPending) {
                return;
            }
            expandPending = true;
            boolean submitted = false;
            try {
                addProxyPool.submit(new Runnable() {
                    @Override
                    public void run() {
                        try {
                            enqueue(addProxies());
                        } finally {
                            // allow the next refill even if addProxies() threw
                            expandPending = false;
                        }
                    }
                });
                submitted = true;
            } finally {
                if (!submitted) {
                    // submit() failed: do not leave the provider stuck in "pending"
                    expandPending = false;
                }
            }
        } finally {
            addProxyLock.unlock();
        }
    }

    /**
     * Removes {@code proxy} from the pool when the validator reports it as invalid.
     * Without a validator this method does nothing.
     *
     * @param proxy the proxy that was used
     * @param page  the page fetched through the proxy
     * @param task  the task that used the proxy
     */
    @Override
    public void returnProxy(Proxy proxy, Page page, Task task) {
        // run the validator outside the lock: it is caller-supplied code
        if (proxyPageValidator == null || proxyPageValidator.proxyValid(proxy, page, task)) {
            return;
        }
        // take the same lock as getProxy so the proxy cannot be "in flight" between
        // poll and offer while we try to remove it
        pollAndOfferLock.lock();
        try {
            validProxyQueue.remove(proxy);
        } finally {
            pollAndOfferLock.unlock();
        }
    }

    /**
     * Supplies additional proxies for the pool. Called from the constructors that do
     * not take an explicit proxy list, and from the background refill thread.
     *
     * @return proxies to append to the pool; {@code null} or empty adds nothing
     */
    public abstract List<Proxy> addProxies();

    /** Appends {@code proxies} to the pool, ignoring {@code null} and empty lists. */
    private void enqueue(List<Proxy> proxies) {
        if (CollectionUtils.isNotEmpty(proxies)) {
            validProxyQueue.addAll(proxies);
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package us.codecraft.webmagic.proxy;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Task;

/**
 * Callback that judges whether a proxy is still usable after a page was fetched
 * through it. {@link EffectiveProxyProvider#returnProxy} removes the proxy from its
 * queue when {@link #proxyValid} returns {@code false}.
 *
 * Created by evan on 2018/8/6.
 */
public interface ProxyPageValidator {

/**
 * Decides whether {@code proxy} should be kept.
 *
 * @param proxy the proxy that was handed back
 * @param page  the page passed along with the returned proxy
 * @param task  the task passed along with the returned proxy
 * @return {@code true} to keep the proxy, {@code false} to report it as invalid
 */
boolean proxyValid(Proxy proxy, Page page, Task task);

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package us.codecraft.webmagic.proxy;

import com.google.common.collect.Lists;
import java.util.List;
import java.util.Random;
import org.junit.Test;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;

/**
* Created by evan on 2018/8/6.
*/
public class EffectiveProxyProviderTest {

public static final Task TASK = Site.me().toTask();

public static final Page page = new Page();

@Test
public void test_get_proxy_process() throws Exception {
while (true) {
Proxy proxy = effectiveProxyProvider.getProxy(TASK);
System.out.println("get proxy :"+proxy.getHost());
effectiveProxyProvider.returnProxy(proxy, page, TASK);
Thread.sleep(2000);
}
}

private ProxyPageValidator proxyPageValidator = new ProxyPageValidator() {
@Override public boolean proxyValid(Proxy proxy, Page page, Task task) {

Random random = new Random();
if (random.nextInt(10) < 5) {
System.out.println("===remove===" + proxy.getHost());
return false;
}
return true;
}
};

private EffectiveProxyProvider effectiveProxyProvider = new EffectiveProxyProvider(proxyPageValidator, Lists.newArrayList(new Proxy("127.0.0.1", 1121))) {
@Override public List<Proxy> addProxies() {
System.out.println("===Expand===");
return Lists.newArrayList(getRandomProxy(3));
}
};

private List<Proxy> getRandomProxy(int count){
Random random = new Random();
List<Proxy> temp = Lists.newArrayList();
for (int i = 0;i<count;i++) {
temp.add(new Proxy("127.0.0." + random.nextInt(255), 123));
}
return temp;
}

}