Commit 4091bd7 (initial commit, 0 parents): 54 changed files with 6,996 additions and 0 deletions.
@@ -0,0 +1,10 @@
__pycache__
*.py~

# *.so

results
old


.idea
@@ -0,0 +1,34 @@
+--------------------------------------+---------------------------------------+
|                                    ABOUT                                     |
+------------------------------------------------------------------------------+

This file contains the list of people involved in the development of
RL-ROBOT, which started at the
Machine Perception and Intelligent Robotics (MAPIR) laboratory
at the University of Malaga <http://mapir.isa.uma.es>.

If you feel someone is missing, please fork and pull-request.
The following list is roughly sorted in reverse chronological order.


+--------------------------------------+---------------------------------------+
|                          DEVELOPERS & CONTRIBUTORS                           |
+------------------------------------------------------------------------------+

* Angel Martinez-Tenor
  [email protected]
  http://mapir.isa.uma.es/mapirwebsite/index.php/people/115-people/230-angel-martinez-tenor
  main developer


+--------------------------------------+---------------------------------------+
|                                    OTHERS                                    |
+------------------------------------------------------------------------------+

Bug reports and new feature suggestions, provided by users worldwide,
will usually be mentioned in the changelog.

We kindly thank all of them for this valuable feedback.


--- END OF FILE ---
@@ -0,0 +1,19 @@
+--------------------------------------+---------------------------------------+
|                                   LICENSE                                    |
+------------------------------------------------------------------------------+

* RL-ROBOT is released under a GPLv3 license. Read license-GPLv3,
  or if not present, <http://www.gnu.org/licenses/>.

* For a closed-source version of RL-ROBOT
  for commercial purposes, please contact the authors.

* If you use RL-ROBOT in an academic work,
  please cite the most relevant associated publication, which can be
  found at <http://mapir.isa.uma.es>; if there is none, please cite the
  Machine Perception and Intelligent Robotics (MAPIR)
  research group directly.


--- END OF FILE ---
@@ -0,0 +1,31 @@
# RL-ROBOT
This repository provides a Reinforcement Learning framework in Python from the Machine Perception and Intelligent Robotics research group [(MAPIR)](http://mapir.isa.uma.es).

### Requirements
* Python 3
* numpy
* matplotlib
* tkinter: `sudo apt-get install python3-tk`

### V-REP settings
(Tested on V-REP_PRO_EDU_V3_3_2 64_bits Linux)

1. Use the default values of `remoteApiConnections.txt`:
~~~
portIndex1_port = 19997
portIndex1_debug = false
portIndex1_syncSimTrigger = true
~~~
2. Activate threaded rendering (recommended):
`system/usrset.txt -> threadedRenderingDuringSimulation = 1`
3. **Execute V-REP** (`./vrep.sh` on Linux), then `File -> Open Scene -> (open any scene for RL-ROBOT)`.

Recommended simulation settings for the RL-ROBOT scenes (already set in the provided ones):
* Simulation step time: 50 ms (default)
* Real-Time Simulation: Enabled
* Multiplication factor: 3.00 (requires a CPU >= i3 3110m)
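Before launching, the three defaults above can be sanity-checked against the actual `remoteApiConnections.txt`. The sketch below is illustrative, not part of RL-ROBOT; it assumes the file's plain `key = value` layout with `//` comments.

```python
def parse_vrep_config(text):
    """Parse simple 'key = value' lines, ignoring blanks and // comments."""
    settings = {}
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("//"):
            continue
        key, sep, value = line.partition("=")
        if sep:
            settings[key.strip()] = value.strip()
    return settings

# Sample content mirroring the defaults listed in the README:
sample = """
// remoteApiConnections.txt (excerpt)
portIndex1_port = 19997
portIndex1_debug = false
portIndex1_syncSimTrigger = true
"""
settings = parse_vrep_config(sample)
```

In practice, `sample` would be replaced by the contents of V-REP's own `remoteApiConnections.txt`, read from disk.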
@@ -0,0 +1,23 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# +-----------------------------------+-----------------------------------+
# |                              RL-ROBOT                                 |
# |                                                                       |
# | Copyright (c) 2016, Individual contributors, see AUTHORS file.        |
# | Machine Perception and Intelligent Robotics (MAPIR),                  |
# | University of Malaga. <http://mapir.isa.uma.es>                       |
# |                                                                       |
# | This program is free software: you can redistribute it and/or modify  |
# | it under the terms of the GNU General Public License as published by  |
# | the Free Software Foundation, either version 3 of the License, or     |
# | (at your option) any later version.                                   |
# |                                                                       |
# | This program is distributed in the hope that it will be useful,       |
# | but WITHOUT ANY WARRANTY; without even the implied warranty of        |
# | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          |
# | GNU General Public License for more details.                          |
# |                                                                       |
# | You should have received a copy of the GNU General Public License     |
# | along with this program. If not, see <http://www.gnu.org/licenses/>.  |
# +-----------------------------------------------------------------------+
""" RL-ROBOT init """
@@ -0,0 +1,168 @@
# -*- coding: utf-8 -*-
# +-----------------------------------------------+
# | RL-ROBOT. Reinforcement Learning for Robotics |
# | Angel Martinez-Tenor                          |
# | MAPIR. University of Malaga. 2016             |
# +-----------------------------------------------+
""" Q-Biased Softmax Regression (QBIASR) v0.8.7 optimized """

import math
import random
from functools import reduce
from itertools import combinations

import numpy as np

import agent
import exp
import lp
import task

DEFAULT_TEMPERATURE = exp.TEMPERATURE
temperature = DEFAULT_TEMPERATURE

control_sequence = np.full(0, -1, dtype=np.int32)
rewards_sequence = np.full(0, -1, dtype=np.float32)

mix = np.full(0, -1, dtype=int)
comb = np.full(0, -1, dtype=int)
initiated = False


def setup():
    """ Initialize QBIASR """
    global control_sequence, rewards_sequence, comb, mix, initiated

    # size_sequence = size of the eligibility queue:
    # n < log(threshold) / log(gamma * lambda)
    threshold = 0.01
    size_sequence = int(math.log(threshold) / math.log(exp.GAMMA * exp.LAMBDA))

    # size_sequence limits: [4, n_states / 4]
    lower_limit = 4  # Mandatory
    upper_limit = int(task.n_states / 4)
    if size_sequence > upper_limit:
        size_sequence = upper_limit
    if size_sequence < lower_limit:
        size_sequence = lower_limit
    control_sequence = np.full(size_sequence, -1, dtype=np.int32)
    rewards_sequence = np.full(size_sequence, -1, dtype=np.float32)

    # Create mix[s], index[s], subrow[s]
    n_inputs = task.n_inputs
    n_states = task.n_states

    comb = np.array(list(combinations(range(n_inputs), n_inputs - 1)),
                    dtype=np.int16)
    # len(comb) = C(n_inputs, n_inputs - 1) = n_inputs
    mix = np.full([n_states, n_inputs, n_states], -1, dtype=int)
    index = np.full([n_states, n_inputs, n_states], -1, dtype=int)

    for s in range(n_states):
        ss = agent.unwrap_state(s)

        for i in range(ss.size):
            j = ss[i]
            n = agent.cont_VAR[i, j]
            for k in range(n):
                index[s, i, k] = agent.VAR[i, j, k]

        for idx, item in enumerate(comb):
            matches = reduce(np.intersect1d, index[s, item])
            mix[s, idx, 0:len(matches)] = matches
    initiated = True
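# Sizing example for setup() above (hypothetical values, not from exp.py):
# with exp.GAMMA = 0.9 and exp.LAMBDA = 0.9, gamma * lambda = 0.81 and
# int(math.log(0.01) / math.log(0.81)) = 21, so the eligibility queue
# would hold 21 entries before clamping to [4, task.n_states / 4].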


def custom_softmax(input_array, temp):
    """ Softmax Boltzmann action selection given a vector and temperature """
    selected_action = -1
    # 1: Get the probabilities
    _input_array_size = len(input_array)
    _Pa = np.zeros(_input_array_size)

    for i in range(_input_array_size):
        _Pa[i] = math.exp(input_array[i] / temp)
    _Pa = np.divide(_Pa, sum(_Pa))

    # 2: Select the action
    ran = random.random()
    accum = 0.0
    for i in range(_input_array_size):
        accum = accum + _Pa[i]
        if ran < accum:
            selected_action = i
            break
    assert selected_action > -1
    return selected_action


def select_biased_action(s):
    """ Select an action 'a' given state 's' by QBIASR """
    assert initiated, " QBIASR not initiated! setup() must be called previously"

    # n_combinations = math.factorial(N_INPUTS) / (math.factorial(level) *
    #                  math.factorial(N_INPUTS - level))
    n_actions = task.n_actions
    q = lp.q
    q_limit = lp.q_limit
    bias_s = 0
    for c in range(len(comb)):
        s_array = mix[s, c]
        s_array = s_array[s_array >= 0]
        subrow = np.zeros((len(s_array), n_actions))
        for idx, item in enumerate(s_array):
            subrow[idx] = q[item]
        aux = np.average(subrow, 0)
        bias_s += aux / len(comb)

    low_reward_loop_evasion(s)
    q_s_bias = q[s] + bias_s

    # 2016_05_26: The temporary q_s_bias row is normalized for softmax
    # regression. Standard q_limit: 100 (e.g., Rmax = 10, GAMMA = 0.9)
    q_s_bias *= 100.0 / q_limit
    selected_action = custom_softmax(tuple(q_s_bias), temperature)
    return selected_action


def low_reward_loop_evasion(s):
    """ Increase the temperature if the agent is stuck in a sequence of states
        with negative average reward """
    global temperature
    global control_sequence
    global rewards_sequence

    size_sequence = control_sequence.size

    # Early steps of learning:
    if lp.step < size_sequence:
        temperature = DEFAULT_TEMPERATURE
        return

    control_sequence = lp.sasr_step[lp.step - size_sequence:lp.step, 0]
    # Different state reached:
    if s not in control_sequence:
        temperature = DEFAULT_TEMPERATURE
        return

    # Not enough repeated states:
    unique_sequence = np.unique(control_sequence)
    loop_rate = control_sequence.size / unique_sequence.size
    if loop_rate <= 2:
        temperature = DEFAULT_TEMPERATURE
        return

    # Average reward positive:
    rewards_sequence = lp.sasr_step[lp.step - size_sequence:lp.step, 3]
    if np.average(rewards_sequence) > 0:
        temperature = DEFAULT_TEMPERATURE
        return

    # Low-reward loop detected. Evasion:
    temperature += 0.25 * loop_rate
    if temperature > 50:
        temperature = 50
    return
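Taken together, `custom_softmax` and `low_reward_loop_evasion` implement temperature-controlled exploration: the biased Q-row feeds a Boltzmann softmax, and the temperature rises when the agent loops through low-reward states. A minimal self-contained sketch of that mechanism (illustrative names like `boltzmann_probabilities`, plain Python, not part of the RL-ROBOT module):

```python
import math
import random


def boltzmann_probabilities(q_row, temperature):
    """Softmax (Boltzmann) distribution over a row of Q-values."""
    exps = [math.exp(q / temperature) for q in q_row]
    total = sum(exps)
    return [e / total for e in exps]


def select_action(q_row, temperature, rng=random.random):
    """Roulette-wheel draw from the Boltzmann distribution."""
    probabilities = boltzmann_probabilities(q_row, temperature)
    ran = rng()
    accum = 0.0
    for i, p in enumerate(probabilities):
        accum += p
        if ran < accum:
            return i
    return len(q_row) - 1  # guard against floating-point rounding


q_row = [1.0, 2.0, 5.0]
cold = boltzmann_probabilities(q_row, 0.5)   # low temperature: near-greedy
hot = boltzmann_probabilities(q_row, 50.0)   # high temperature: near-uniform
```

Raising the temperature, as `low_reward_loop_evasion` does by `0.25 * loop_rate` per detection, flattens the distribution and so pushes the agent out of the loop by randomizing its choices.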