
Migration workflow


Migration workflow steps

v1.23.0 --> v1.24.0

Migration index creation

  • add a PYTHONPATH='<path>/rero-ils-migration' variable in the docker-service.yaml file (see the sketch below).
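A minimal sketch of that variable in docker-service.yaml, assuming a docker-compose style service definition (the service name is a placeholder):

services:
  worker:
    environment:
      - PYTHONPATH=<path>/rero-ils-migration
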
poetry run invenio reroils migrations index init 
# create the bramois migration project
poetry run invenio reroils migrations create bramois 64 -d " Migration données bibliothèque de Bramois" "rero_ils_migrations.converter_2024_bramois.BramoisConverter"
# convert the data to JSON
poetry run invenio reroils migrations data load bramois <path>/bramois.xml 
# run the dedup
poetry run invenio reroils migrations data dedup bramois

Migration permissions

# update permissions
from rero_ils.modules.cli.fixtures import load_role_policies
cfg = {
  "mig-search": [
    "pro_full_permissions",
    "pro_library_administrator",
    "pro_catalog_manager"
  ],
  "mig-read": [
    "pro_full_permissions",
    "pro_library_administrator",
    "pro_catalog_manager"
  ],
  "mig-access": [
    "pro_full_permissions",
    "pro_library_administrator",
    "pro_catalog_manager"
  ],
  "mig-update": [
    "pro_full_permissions",
    "pro_library_administrator",
    "pro_catalog_manager"
  ]
}
load_role_policies(cfg)

Change Bibliomedia records to harvested

from rero_ils.modules.documents.api import Document, DocumentsSearch

query = (
    DocumentsSearch()
    .filter("exists", field="adminMetadata")
    .filter("match", adminMetadata__note="Bibliomedia")
    .exclude("term", harvested=True)
)
doc_pids = [hit.pid for hit in query.source('pid').scan()]
for pid in doc_pids:
    if doc := Document.get_record_by_pid(pid):
        change = False
        admin_metadata = doc.get('adminMetadata')
        for note in admin_metadata.get('note', []):
            if note.startswith('!!! Attention : notice Bibliomedia, ne pas se raccrocher !!!'):
                print(doc.pid, admin_metadata['note'])
                doc['harvested'] = True
                change = True
        if change:
            doc.update(data=doc, dbcommit=True, reindex=True)

Sync remote entities bf:Topic, bf:Temporal, bf:Place

from rero_ils.modules.entities.remote_entities.api import RemoteEntitiesSearch
from rero_ils.modules.entities.remote_entities.sync import SyncEntity

sync = SyncEntity()
query = RemoteEntitiesSearch().filter("terms", type=["bf:Topic", "bf:Temporal", "bf:Place"])
count = query.count()
for idx, hit in enumerate(query.source("pid").scan(), 1):
    res = sync.sync_record(hit.pid)
    print(f"{idx:>7}/{count} {hit.pid} {res}")

Reindex Documents with remote entities

from rero_invenio_base.modules.tasks import run_on_worker
from rero_invenio_base.modules.utils import chunk
from rero_ils.modules.documents.api import Document, DocumentsSearch

code = '''
def reindex(_ids):
    from rero_ils.modules.documents.api import Document
    n = 0
    errors = []
    for _id in _ids:
        try:
            doc = Document.get_record(_id)
            doc.reindex()
            n += 1
        except Exception as e:
            print('error', e)
            errors.append(_id)
    return (n, errors)
'''

parallel = 7
# subjects
count = 0
query = DocumentsSearch().filter("exists", field="subjects.entity.pid")
ids = [hit.meta.id for hit in query.source().scan()]
for c in chunk([str(val) for val in ids], len(ids) // parallel):
    count += 1
    res = run_on_worker.delay(code, 'reindex', c)
    print('subjects', count, len(c), res)
# genreForm
count = 0
query = DocumentsSearch().filter("exists", field="genreForm.entity.pid")
ids = [hit.meta.id for hit in query.source().scan()]
for c in chunk([str(val) for val in ids], len(ids) // parallel):
    count += 1
    res = run_on_worker.delay(code, 'reindex', c)
    print('genreForm', count, len(c), res)

# from rero_ils.modules.documents.api import Document, DocumentsSearch
#
# for entity_type in ["subjects", "genreForm"]:
#     query = DocumentsSearch().filter("exists", field=f"{entity_type}.entity.pid")
#     count = query.count()
#     for idx, hit in enumerate(query.source("pid").scan(), 1):
#         if doc := Document.get_record_by_pid(hit.pid):
#             print(f"{idx:>7}/{count} {entity_type} {doc.pid}")
#             doc.reindex()

v1.22.1 --> v1.23.0

sync files

rsync -avr /network/nfs/files_prod/* /network/nfs/files/  --exclude=lost+found/

Clean RABBITMQ

# after docker compose down
sudo rm -rf /data/ils/prod/mq/*

copy static files

docker-compose exec -u root web-ui bash -c "cp -r /invenio/var/instance/static/* /invenio/instance/static/."

Alembic

poetry run invenio alembic upgrade

Update mapping

poetry run invenio rero es index update-mapping

Reindex Remote entities (~1h30)

from rero_ils.modules.entities.remote_entities.api import \
    RemoteEntitiesIndexer, RemoteEntity
from rero_ils.modules.tasks import process_bulk_queue

entities_ids = RemoteEntity.get_all_ids()
RemoteEntitiesIndexer().bulk_index(entities_ids)
process_bulk_queue.apply_async()

Reindex Operation Logs (<1h)

from invenio_search import current_search
# update operation logs templates
[p for p in current_search.put_templates()]

from invenio_search import current_search_client
def reindex(source, destination):
    res = current_search_client.reindex(
        body=dict(
            source=dict(
                index=source
            ),
            dest=dict(
                index=destination,
                version_type='external_gte'
            )
        ),
        wait_for_completion=False
    )
    return res['task']

def index_in_new(indices):
    tasks = []
    body = {
        "settings": {
            "number_of_shards": "8",
            "number_of_replicas": "1",
            "max_result_window": "100000"
        }
    }
    for index_name in indices:
        print(index_name)
        new_index_name = f'{index_name}-new'
        current_search_client.indices.create(new_index_name, body=body)
        tasks.append(reindex(index_name, new_index_name))
    return tasks

def remove_old(indices):
    for index_name in indices:
        current_search_client.indices.delete(index_name)
        print(f'{index_name} has been deleted')

def rename_to_old(indices):
    tasks = []
    body = {
        "settings": {
            "number_of_shards": "8",
            "number_of_replicas": "1",
            "max_result_window": "100000"
        }
    }
    for index_name in indices:
        print(index_name)
        new_index_name = f'{index_name}-new'
        current_search_client.indices.create(index_name, body=body)
        tasks.append(reindex(new_index_name, index_name))
    return tasks

# Execute one line after the other
# Get the list of the operation logs indices
indices = ['operation_logs-2024']
tasks = index_in_new(indices)
# Check for completion
[current_search_client.tasks.get(t).get('completed') for t in tasks]
remove_old(indices)
tasks = rename_to_old(indices)
[current_search_client.tasks.get(t).get('completed') for t in tasks]
remove_old([i+'-new' for i in indices])

Reharvest ebooks

poetry run invenio reroils oaiharvester harvest -n ebooks -q -f 1990-01-01

Enable task

poetry run invenio reroils scheduler enable_tasks -n automatic_renewal

v1.21.0 --> v1.22.1

Requirement

# create new frontend image with the new configuration
docker-compose build frontend
docker-compose stop selfcheck scheduler worker frontend

copy static files

docker-compose exec -u root web-ui bash -c "cp -r /invenio/var/instance/static/* /invenio/instance/static/."

Babeltheque

Set RERO_ILS_APP_BABELTHEQUE_ENABLED_VIEWS = ['vs']
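
A sketch of where to set it, assuming the instance is configured via invenio.cfg (an INVENIO_-prefixed environment variable works as well):

# invenio.cfg
RERO_ILS_APP_BABELTHEQUE_ENABLED_VIEWS = ['vs']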

Update mapping

Files

Set INVENIO_RERO_ILS_FILES_FOLDER = /network/nfs/files
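
A sketch, assuming the INVENIO_ prefix means the value is provided as an environment variable (e.g. in the docker-compose service definition):

INVENIO_RERO_ILS_FILES_FOLDER=/network/nfs/files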

# create new tables
poetry run invenio db create
# create new indices
poetry run invenio rero es index update-mapping
poetry run invenio index create  -b rero_ils/modules/files/mappings/v7/files/record-v1.0.0.json files-record-v1.0.0-20240521

# init storage
poetry run invenio files location create --default default /network/nfs/files
from invenio_search import current_search, current_search_client
current_search_client.indices.put_alias('files-record-v1.0.0-20240521', 'files')
current_search_client.indices.put_alias('files-record-v1.0.0-20240521', 'files-record-v1.0.0')

# update permissions
from rero_ils.modules.cli.fixtures import load_role_policies
from rero_ils.modules.cli.fixtures import load_system_role_policies
cfg = {
  "file-create": [
    "pro_full_permissions",
    "pro_catalog_manager",
    "pro_library_administrator"
  ],
  "file-update": [
    "pro_full_permissions",
    "pro_catalog_manager",
    "pro_library_administrator"
  ],
  "file-delete": [
    "pro_full_permissions",
    "pro_catalog_manager",
    "pro_library_administrator"
  ]
}
load_role_policies(cfg)

sys_cfg = {
  "file-search": [
    "any_user"
  ],
  "file-read": [
    "any_user"
  ]
}
load_system_role_policies(sys_cfg)

Records

index_name=`poetry run invenio rero es index info -i records`
echo $index_name
poetry run invenio rero es index move records $index_name records-record-v1.0.0-20240521 -v
poetry run invenio index delete $index_name
from invenio_search import current_search, current_search_client
current_search_client.indices.put_alias('records-record-v1.0.0-20240521', 'records-record-v1.0.0')

Documents (20')

index_name=`poetry run invenio rero es index info -i documents`
echo $index_name
poetry run invenio rero es index move documents $index_name documents-document-v0.0.1-20240521 -v
poetry run invenio index delete $index_name

Fiction (4' + 13' + 15')

from sqlalchemy import func
from invenio_db import db
from rero_ils.modules.documents.api import Document, DocumentsSearch
from invenio_search import current_search_client
from time import sleep

def change(query, fiction_statement, delay=5):
    """change DB and ES."""
    print(f'Get ids ({query.count()}) ...')
    ids = [hit.meta.id for hit in query.source(False).scan()]

    print('Update DB ...', end=' ')
    count = Document.model_cls.query \
        .filter(Document.model_cls.id.in_(ids)) \
        .update(
            {
                'json': func.jsonb_set(
                    Document.model_cls.json,
                    '{fiction_statement}',
                    f'"{fiction_statement}"'
                ),
                'version_id': Document.model_cls.version_id + 1
            },
            synchronize_session=False
        )
    print(count)
    db.session.commit()

    body = query.to_dict()
    body.update(
        {"script": {
            "source": f"ctx._source['fiction_statement'] = '{fiction_statement}'"
        }}
    )

    info = current_search_client.update_by_query(
        index='documents',
        body=body,
        wait_for_completion=False
    )
    task_id = info['task']
    print(f'Update ES ... task id: "{task_id}"')
    task = current_search_client.tasks.get(task_id)
    print(
        f'updated: {task["task"]["status"]["updated"]} '
        f'conflicts: {task["task"]["status"]["version_conflicts"]}',
        end='\r'
    )
    while not task['completed']:
        sleep(delay)
        task = current_search_client.tasks.get(task_id)
        print(
            f'updated: {task["task"]["status"]["updated"]} '
            f'conflicts: {task["task"]["status"]["version_conflicts"]}',
            end='\r'
        )
    print()
    return task['response']


# if there are failures in the response of a change() call below, run this block:
failures = response.get('failures', [])
print(f'Correct errors: {len(failures)}')
for idx, failure in enumerate(failures, 1):
    id_ = failure.get('id')
    print(idx, id_, end='\r')
    current_search_client.delete(
        index='documents',
        id=id_,
        refresh=True
    )
print('Sleep: 60                              ')
sleep(60)
for idx, failure in enumerate(failures, 1):
    id_ = failure.get('id')
    print(idx, id_, end='\r')
    doc = Document.get_record(id_)
    try:
        doc.reindex()
    except Exception as err:
        print(idx, id_, err)
# end of the error correction block


FICTIONS_TERMS = ['Fictions', 'Films de fiction']
# Fiction
query = DocumentsSearch() \
    .filter('terms', facet_genre_form_en=FICTIONS_TERMS)
response = change(query, 'fiction')
# test
fiction_count = DocumentsSearch() \
    .filter('term', fiction_statement='fiction') \
    .count()
print(f'term: {query.count()} statement: {fiction_count}')

# Non fiction
query = DocumentsSearch() \
    .exclude('term', harvested=True) \
    .exclude('terms', facet_genre_form_en=FICTIONS_TERMS) \
    .filter('exists', field='subjects')
response = change(query, 'non_fiction')
# test
non_fiction_count = DocumentsSearch() \
    .filter('term', fiction_statement='non_fiction') \
    .count()
print(f'term: {query.count()} statement: {non_fiction_count}')

# Unspecified
query = DocumentsSearch() \
    .exclude('exists', field='fiction_statement')
db.session.close()
response = change(query, 'unspecified')
# test
count = DocumentsSearch().count() - fiction_count - non_fiction_count
unspecified_count = DocumentsSearch() \
    .filter('term', fiction_statement='unspecified') \
    .count()
print(f'{count} statement: {unspecified_count}')

Remove legacy fields (20')

Delete field legacy_circulation_rules from all items (https://github.com/rero/rero-ils/pull/3671).

from rero_ils.modules.items.api import ItemsSearch, Item
from invenio_search import current_search_client
from invenio_db import db
from time import sleep
from elasticsearch_dsl import Q


query = ItemsSearch() \
    .filter(
        Q('exists', field='legacy_circulation_rules')
    )

print(f'Get ids ({query.count()}) ...')
ids = [hit.meta.id for hit in query.source().scan()]

print('Change DB ... ', end='')
count = Item.model_cls.query \
    .filter(Item.model_cls.id.in_(ids)) \
    .update(
        {
            Item.model_cls.json: (
                Item.model_cls.json - 'legacy_circulation_rules'
            ),
            'version_id': Item.model_cls.version_id + 1
        },
        synchronize_session=False
    )
db.session.commit()
print(count)

print('Change ES ...')
body = query.to_dict()
body.update({"script" : "ctx._source.remove(\"legacy_circulation_rules\")"}, )

info = current_search_client.update_by_query(
    index='items',
    body=body,
    wait_for_completion=False
)
task_id = info['task']
print(f'Update ES ... task id: "{task_id}"')
task = current_search_client.tasks.get(task_id)
print(
    f'updated: {task["task"]["status"]["updated"]} '
    f'conflicts: {task["task"]["status"]["version_conflicts"]}',
    end='\r'
)
while not task['completed']:
    sleep(5)
    task = current_search_client.tasks.get(task_id)
    print(
        f'updated: {task["task"]["status"]["updated"]} '
        f'conflicts: {task["task"]["status"]["version_conflicts"]}',
        end='\r'
    )
print()

Alembic fiction

poetry run invenio alembic upgrade

# if the plain upgrade fails, stamp and upgrade revision by revision:
poetry run invenio alembic stamp 2e97565eba72
poetry run invenio alembic upgrade
poetry run invenio alembic stamp 8ae99b034410
poetry run invenio alembic upgrade
poetry run invenio alembic stamp a29271fd78f8
poetry run invenio alembic upgrade

Update SLM

Add files* to slm_daily_all.json.
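
The relevant part of slm_daily_all.json, sketched under the assumption that it is a standard Elasticsearch SLM policy (every value except the added files* pattern is a placeholder):

{
  "schedule": "0 30 1 * * ?",
  "name": "<daily-all-{now/d}>",
  "repository": "backup",
  "config": {
    "indices": ["documents*", "items*", "files*"]
  }
}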

poetry run invenio rero es slm put daily slm_daily_all.json

v1.20.0 --> v1.21.0

Reindexing

Flask2

Flask-Wiki

  • add an INVENIO_WIKI_INDEX_DIR variable in the configmap of your production environment (see the sketch below).
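
A minimal sketch of the variable in a Kubernetes ConfigMap (the ConfigMap name and the directory value are placeholders):

apiVersion: v1
kind: ConfigMap
metadata:
  name: ils-config
data:
  INVENIO_WIKI_INDEX_DIR: /invenio/var/instance/wiki_index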

migrate invenio-userprofiles to user.user_profile

from invenio_db import db
from rero_ils.modules.patrons.api import Patron
db.session.execute("DELETE from alembic_version where version_num = 'c25ef2c50ffa'")
db.session.commit()
# run poetry run invenio alembic upgrade
# INFO  [alembic.runtime.migration] Context impl PostgresqlImpl.
# INFO  [alembic.runtime.migration] Will assume transactional DDL.
# INFO  [alembic.runtime.migration] Running upgrade 04480be1593e -> 842a62b56e60, Change FK AccountsRole to string (downgrade recipe).
# INFO  [alembic.runtime.migration] Running upgrade e12419831262 -> 999dcbd19ace, Add versioning information to models.
# INFO  [alembic.runtime.migration] Running upgrade 999dcbd19ace -> dfbdf43a3e96, Separate login info from user table.
# INFO  [alembic.runtime.migration] Running upgrade dfbdf43a3e96 -> 62efc52773d4, Create UserIdentity table.
# INFO  [alembic.runtime.migration] Running upgrade 62efc52773d4 -> eb9743315a9d, Add user profile and preferences as JSON fields to the User table.
# INFO  [alembic.runtime.migration] Running upgrade eb9743315a9d -> f2522cdd5fcd, Change AccountsRole primary key to string.
# INFO  [alembic.runtime.migration] Running upgrade f2522cdd5fcd, 842a62b56e60 -> f9843093f686, Change FK AccountsRole to string (upgrade recipe).
# INFO  [alembic.runtime.migration] Running upgrade f9843093f686 -> 037afe10e9ff, Add user moderation fields.
# INFO  [alembic.runtime.migration] Running upgrade bff1f190b9bd -> aaa265b0afa6, Move UserIdentity to accounts.
# INFO  [alembic.runtime.migration] Running upgrade  -> 759d47cbdba7, Create oaiserver branch.
# INFO  [alembic.runtime.migration] Running upgrade 759d47cbdba7 -> e655021de0de, Create oiaserver tables.
# INFO  [alembic.runtime.migration] Running upgrade e655021de0de -> 5d25c1981985, Add system_created field.
total = db.session.execute('SELECT COUNT(*) from userprofiles_userprofile').all()[0][0]
errors = {}
for idx, prof in enumerate(db.session.execute('SELECT * from userprofiles_userprofile').all(), 1):
    print(f'{idx}/{total}', end='\r')
    prof = {k: v for k, v in dict(prof).items() if v}
    user_id = prof.pop('user_id')
    user = Patron._get_user_by_user_id(user_id)
    if user.username:
        continue
    prof.pop('displayname', None)
    try:
        if birth_date := prof.get('birth_date'):
            prof['birth_date'] = birth_date.strftime('%Y-%m-%d')
        if username := prof.pop('username', None):
            user.username = username
        user.user_profile = prof
        db.session.merge(user)
        if idx % 100 == 0:
            db.session.commit()
    except Exception as e:
        errors[user_id] = e

db.session.commit()
if errors:
    print('errors:', errors)
else:
    db.session.execute('DROP TABLE userprofiles_userprofile')

Mapping

Elasticsearch

poetry run invenio rero es index update-mapping
index_name=`poetry run invenio rero es index info -i vendors`
echo $index_name
poetry run invenio rero es index move vendors $index_name vendors-vendor-v0.0.1-20240206 -v
poetry run invenio rero es index update-mapping
poetry run invenio index delete $index_name

index_name=`poetry run invenio rero es index info -i acq_orders`
echo $index_name
poetry run invenio rero es index move acq_orders $index_name acq_orders-acq_order-v0.0.1-20240206 -v
poetry run invenio rero es index update-mapping
poetry run invenio index delete $index_name

index_name=`poetry run invenio rero es index info -i documents`
echo $index_name
poetry run invenio rero es index move documents $index_name documents-document-v0.0.1-20240206 -v
poetry run invenio rero es index update-mapping
poetry run invenio index delete $index_name

Documents partOf

from rero_ils.modules.documents.api import Document, DocumentsSearch
from invenio_db import db

query = DocumentsSearch().filter('exists', field='partOf.numbering')
total = query.count()
for idx, hit in enumerate(query.source('pid').scan(), 1):
    print(f'{idx}/{total} {hit.pid}', end='\r')
    if doc := Document.get_record_by_pid(hit.pid):
        for part_of in doc['partOf']:
            for numbering in part_of.get('numbering', []):
                if 'volume' in numbering:
                    numbering['volume'] = str(numbering['volume'])
                if 'issue' in numbering:
                    numbering['issue'] = str(numbering['issue'])
        # TODO: make it faster
        # doc.update(data=doc, dbcommit=True, reindex=True)
        doc.model.json = doc
        db.session.merge(doc.model)
        doc.reindex()
        if idx % 100 == 0:
            db.session.commit()
db.session.commit()

v1.19.0 --> v1.20.0

Stats Cfg

# create stat cfg table
poetry run invenio db create
# create the index
poetry run invenio index create  -b rero_ils/modules/stats_cfg/mappings/v7/stats_cfg/stat_cfg-v0.0.1.json 'stats_cfg-stat_cfg-v0.0.1-20231121'
from invenio_search import current_search, current_search_client
# create the aliases
current_search_client.indices.put_alias('stats_cfg-stat_cfg-v0.0.1-20231121', 'stats_cfg')
current_search_client.indices.put_alias('stats_cfg-stat_cfg-v0.0.1-20231121', 'stats_cfg-stat_cfg-v0.0.1')

Operation Logs

# update operation logs templates
from invenio_search import current_search, current_search_client
[p for p in current_search.put_templates()]
poetry run invenio rero es index update-mapping

def reindex(source, destination):
    res = current_search_client.reindex(
        body=dict(
            source=dict(
                index=source
            ),
            dest=dict(
                index=destination,
                version_type='external_gte'
            )
        ),
        wait_for_completion=False
    )
    return res['task']

def index_in_new(indices):
    tasks = []
    body = {
        "settings": {
            "number_of_shards": "8",
            "number_of_replicas": "1",
            "max_result_window": "100000"
        }
    }
    for index_name in indices:
        print(index_name)
        new_index_name = f'{index_name}-new'
        current_search_client.indices.create(new_index_name, body=body)
        tasks.append(reindex(index_name, new_index_name))
    return tasks

def remove_old(indices):
    for index_name in indices:
        current_search_client.indices.delete(index_name)
        print(f'{index_name} has been deleted')

def rename_to_old(indices):
    tasks = []
    body = {
        "settings": {
            "number_of_shards": "8",
            "number_of_replicas": "1",
            "max_result_window": "100000"
        }
    }
    for index_name in indices:
        print(index_name)
        new_index_name = f'{index_name}-new'
        current_search_client.indices.create(index_name, body=body)
        tasks.append(reindex(new_index_name, index_name))
    return tasks

# Execute one line after the other
# Get the list of the operation logs indices
indices = list(current_search_client.indices.get_alias('operation_logs').keys())
tasks = index_in_new(indices)
# Check for completion
[current_search_client.tasks.get(t).get('completed') for t in tasks]
remove_old(indices)
tasks = rename_to_old(indices)
[current_search_client.tasks.get(t).get('completed') for t in tasks]
remove_old([i+'-new' for i in indices])

Roles

from invenio_access.models import Role, ActionRoles
from invenio_db import db
from rero_ils.modules.cli.fixtures import load_role_policies

# rename the most recently created role
r1 = Role.query.all()[-1]
print(r1.name)  # check which role is being renamed
r1.name = 'pro_statistic_manager'
db.session.merge(r1)
db.session.commit()

cfg = {
  "stat-access": [
    "pro_statistic_manager",
  ],
  "stat-search": [
    "pro_statistic_manager",
    "pro_library_administrator"
  ],
  "stat-read": [
    "pro_statistic_manager",
    "pro_library_administrator"
  ],
  "stat_cfg-access": [
    "pro_full_permissions",
    "pro_statistic_manager",
    "pro_library_administrator"
  ],
  "stat_cfg-search": [
    "pro_full_permissions",
    "pro_statistic_manager",
    "pro_library_administrator"
  ],
  "stat_cfg-read": [
    "pro_full_permissions",
    "pro_statistic_manager",
    "pro_library_administrator"
  ],
  "stat_cfg-create": [
    "pro_full_permissions",
    "pro_statistic_manager",
    "pro_library_administrator"
  ],
  "stat_cfg-update": [
    "pro_full_permissions",
    "pro_statistic_manager",
    "pro_library_administrator"
  ],
  "stat_cfg-delete": [
    "pro_full_permissions",
    "pro_statistic_manager",
    "pro_library_administrator"
  ]}
load_role_policies(cfg)

# remove the stat permissions from the pro_read_only role
role_id = Role.query.filter_by(name='pro_read_only').first().id
ar = ActionRoles.query.filter_by(action='stat-search').filter_by(role_id=role_id).first()
db.session.delete(ar)
ar = ActionRoles.query.filter_by(action='stat-read').filter_by(role_id=role_id).first()
db.session.delete(ar)
db.session.commit()

Enable tasks

TODO
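
Once the task names are known, the scheduler CLI pattern used elsewhere on this page should apply (the task name is a placeholder):

poetry run invenio reroils scheduler enable_tasks -n <task_name>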

v18.x --> v19.0 (In progress)

Mappings

Entities: delete the old index (replaced by local and remote entity indices with aliases)

poetry run invenio index create  -b rero_ils/modules/entities/remote_entities/mappings/v7/remote_entities/remote_entity-v0.0.1.json remote_entities-remote_entity-v0.0.1-20231031

Elasticvue REST

POST /_reindex
{
  "source": {
    "index": "entities-entity-v0.0.1-20230516"
  },
  "dest": {
    "index": "remote_entities-remote_entity-v0.0.1-20231031",
    "version_type": "external_gte"
  },
  "script": {
    "source": "ctx._source['$schema'] = 'https://bib.rero.ch/schemas/remote_entities/remote_entity-v0.0.1.json'; ctx._source['resource_type'] = 'remote'"
  }
}
poetry run invenio rero es index update-mapping

#index_name=entities-entity-v0.0.1-20230516
#poetry run invenio index delete $index_name

poetry run invenio rero es alias put remote_entities-remote_entity-v0.0.1-20231031 remote_entities
poetry run invenio rero es alias put remote_entities-remote_entity-v0.0.1-20231031 remote_entities-remote_entity-v0.0.1
poetry run invenio rero es alias put remote_entities-remote_entity-v0.0.1-20231031 entities
poetry run invenio rero es alias put remote_entities-remote_entity-v0.0.1-20231031 entities-entity-v0.0.1

poetry run invenio index create  -b rero_ils/modules/entities/local_entities/mappings/v7/local_entities/local_entity-v0.0.1.json local_entities-local_entity-v0.0.1-20231031
poetry run invenio rero es alias put local_entities-local_entity-v0.0.1-20231031 local_entities
poetry run invenio rero es alias put local_entities-local_entity-v0.0.1-20231031 local_entities-local_entity-v0.0.1
poetry run invenio rero es alias put local_entities-local_entity-v0.0.1-20231031 entities
poetry run invenio rero es alias put local_entities-local_entity-v0.0.1-20231031 entities-entity-v0.0.1
index_name=`poetry run invenio rero es index info -i items`
echo $index_name
poetry run invenio rero es index move items $index_name items-item-v0.0.1-20231019
poetry run invenio index delete $index_name

poetry run invenio rero es index update-mapping

DB

rename entity remote_entity

su - postgres
psql reroils
ALTER TABLE entity_id RENAME TO remote_entity_id;
ALTER TABLE entity_metadata RENAME TO remote_entity_metadata;

Rename pid_type ent -> rement

from invenio_db import db
from invenio_pidstore.models import PersistentIdentifier
PersistentIdentifier.query.filter_by(pid_type='ent').update({'pid_type': 'rement'})
db.session.commit()

Fixes entities jsonschema

from sqlalchemy import func
from rero_ils.modules.entities.remote_entities.api import RemoteEntity
from invenio_db import db
# Correct $schema for entities in db
old_schema = 'https://bib.rero.ch/schemas/entities/entity-v0.0.1.json'
schema = 'https://bib.rero.ch/schemas/remote_entities/remote_entity-v0.0.1.json'
count = RemoteEntity.model_cls.query \
    .filter(RemoteEntity.model_cls.json['$schema'].as_string() == old_schema) \
    .update(
        {"json": func.jsonb_set(RemoteEntity.model_cls.json, '{$schema}', f'"{schema}"')},
        synchronize_session=False
    )

db.session.commit()
print(f"{count} updated entities")

If there are differences between DB and ES:

poetry run /network/nfs/data_ils/ils/scripts/correct_wrong_pids.py reroils utils correct-wrong-pids -t rement -v -c

create tables for local_entity

poetry run invenio db create

Reindex documents

from rero_invenio_base.modules.tasks import run_on_worker
from rero_invenio_base.modules.utils import chunk
from rero_ils.modules.documents.api import Document
code = '''
def reindex(_ids):
    from rero_ils.modules.documents.api import Document
    n = 0
    errors = []
    for _id in _ids:
        try:
            doc = Document.get_record(_id)
            doc.reindex()
            n += 1
        except Exception as e:
            print('error', e)
            errors.append(_id)
    return (n, errors)
'''

parallel = 7
count = 0
for c in chunk([str(val) for val in Document.get_all_ids()], Document.count() // parallel):
    count += 1
    res = run_on_worker.delay(code, 'reindex', c)
    print(count, len(c), res)

Stats

Re-generate incorrect circulation stats

TODO: fix the script for pricing stats; the incorrect numbers are computed because those stats seem to have no date_range!

import arrow
from dateutil.relativedelta import relativedelta

from rero_ils.modules.stats.api.librarian import StatsForLibrarian
from rero_ils.modules.stats.api.pricing import StatsForPricing
from rero_ils.modules.stats.api.api import Stat, StatsSearch
from rero_ils.modules.libraries.api import LibrariesSearch

search = StatsSearch()\
    .filter('range', _created={'gte': '2023-07-30'})

for hit in list(search.source('pid').scan()):
    try:
        stat = Stat.get_record(hit.meta.id)
        if stat['type'] == 'billing':
            to_date = arrow.Arrow.fromdatetime(stat.created - relativedelta(days=1))
            compute = StatsForPricing(to_date=to_date)
            for val in stat.get('values', []):
                lib_pid = val['library']['pid']
                # number_of_checkouts
                new_number_of_checkouts = compute.number_of_circ_operations(lib_pid, 'checkout')
                print(lib_pid, val['number_of_checkouts'], new_number_of_checkouts)
                val['number_of_checkouts'] = new_number_of_checkouts
                # number_of_renewals
                new_number_of_renewals = compute.number_of_circ_operations(lib_pid, 'extend')
                print(lib_pid, val['number_of_renewals'], new_number_of_renewals)
                val['number_of_renewals'] = new_number_of_renewals
                # number_of_checkins
                new_number_of_checkins = compute.number_of_circ_operations(lib_pid, 'checkin')
                print(lib_pid, val['number_of_checkins'], new_number_of_checkins)
                val['number_of_checkins'] = new_number_of_checkins
                # number_of_ill_requests
                new_number_of_ill_requests = compute.number_of_ill_requests(lib_pid, ['denied'])
                print(lib_pid, val['number_of_validated_ill_requests'], new_number_of_ill_requests)
                val.pop('number_of_validated_ill_requests')
                val['number_of_ill_requests'] = new_number_of_ill_requests
                # number_of_requests
                new_number_of_requests = compute.number_of_circ_operations(lib_pid, 'request')
                print(lib_pid, val['number_of_requests'], new_number_of_requests)
                val['number_of_requests'] = new_number_of_requests
        elif stat['type'] == 'librarian':
            compute = StatsForLibrarian()
            compute.date_range = stat['date_range']
            for val in stat.get('values', []):
                lib_pid = val['library']['pid']
                # checkouts_for_transaction_library
                new_checkouts_for_transaction_library = compute.checkouts_for_transaction_library(lib_pid)
                print(lib_pid, val['checkouts_for_transaction_library'], new_checkouts_for_transaction_library)
                val['checkouts_for_transaction_library'] = new_checkouts_for_transaction_library
                # checkouts_for_owning_library
                new_checkouts_for_owning_library = compute.checkouts_for_owning_library(lib_pid)
                print(lib_pid, val['checkouts_for_owning_library'], new_checkouts_for_owning_library)
                val['checkouts_for_owning_library'] = new_checkouts_for_owning_library
                # active_patrons_by_postal_code
                new_active_patrons_by_postal_code = compute.active_patrons_by_postal_code(lib_pid)
                print(lib_pid, val['active_patrons_by_postal_code'], new_active_patrons_by_postal_code)
                val['active_patrons_by_postal_code'] = new_active_patrons_by_postal_code
                # new_active_patrons_by_postal_code
                new_new_active_patrons_by_postal_code = compute.active_patrons_by_postal_code(lib_pid, new_patrons=True)
                print(lib_pid, val['new_active_patrons_by_postal_code'], new_new_active_patrons_by_postal_code)
                val['new_active_patrons_by_postal_code'] = new_new_active_patrons_by_postal_code
                # renewals
                new_renewals = compute.renewals(lib_pid)
                print(lib_pid, val['renewals'], new_renewals)
                val['renewals'] = new_renewals
                # loans_of_transaction_library_by_item_location
                new_loans_of_transaction_library_by_item_location = compute.loans_of_transaction_library_by_item_location(lib_pid)
                print(lib_pid, val['loans_of_transaction_library_by_item_location'], new_loans_of_transaction_library_by_item_location)
                val['loans_of_transaction_library_by_item_location'] = new_loans_of_transaction_library_by_item_location

        stat.update(stat, commit=True, dbcommit=True, reindex=True)
    except Exception as err:
        print('ERROR', hit.pid, err)

Permissions

RERO+ instance: disable all local entities permissions for the all_permissions role AND pro_entity_manager; we don't use this feature.
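
No script is given for this step; below is a sketch following the ActionRoles removal pattern from the Roles step above, assuming the all_permissions role is pro_full_permissions. The local entity action names are assumptions and must be checked against the instance:

from invenio_access.models import ActionRoles, Role
from invenio_db import db

# assumption: local entity actions are named like this; verify with
# sorted({ar.action for ar in ActionRoles.query.all()})
actions = ['locent-create', 'locent-update', 'locent-delete']
for role_name in ['pro_full_permissions', 'pro_entity_manager']:
    if role := Role.query.filter_by(name=role_name).first():
        for action in actions:
            for ar in ActionRoles.query.filter_by(action=action, role_id=role.id).all():
                db.session.delete(ar)
db.session.commit()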

Change SLM

v17.x --> v18.0

Update Mappings

poetry run invenio rero es index update-mapping

Libraries

index_name=`poetry run invenio rero es index info -i libraries`
echo $index_name
poetry run invenio rero es index move libraries $index_name libraries-library-v0.0.1-20230719
poetry run invenio rero es index update-mapping
poetry run invenio index delete $index_name

Alembic

  • poetry run invenio alembic upgrade e63e5dfa2416
  • poetry run invenio alembic upgrade 64a5cc96f96e
  • poetry run invenio alembic upgrade 8d97be2c8ad6

Process stats

from rero_ils.modules.stats.api import StatsForLibrarian, Stat
stat = Stat.get_record_by_pid('786')
compute = StatsForLibrarian()
compute.date_range = stat['date_range']
for val in stat.get('values', []):
    lib_pid = val['library']['pid']
    new_v_req = compute.validated_requests(lib_pid)
    print(lib_pid, val['validated_requests'],  new_v_req)
    val['validated_requests'] = new_v_req
stat.update(stat, commit=True, dbcommit=True, reindex=True)

Libraries

  • copy the configuration for serial acquisition settings from the acquisition settings, and set default times for exception dates that are open but have no times, using the script below:
from rero_ils.modules.libraries.api import Library, LibrariesSearch

print('Updating libraries acquisition settings and exception dates...')
libraries = LibrariesSearch()
print(f'Found {libraries.count()}')

errors = []
default_time_libs = []
time = {
    'start_time': '08:00',
    'end_time': '08:10'
}

for hit in libraries.source().scan():
    lib = Library.get_record(hit.meta.id)
    default_time = False
    for date in lib.get('exception_dates', []):
        if date.get('is_open', False) and not date.get('times', []):
            default_time = True
            date['times'] = [time]
    if default_time:
        default_time_libs.append(lib.pid)

    try:
        if settings := lib.get('acquisition_settings'):
            lib['serial_acquisition_settings'] = settings
        lib.update(lib, True, True, True)
        print(f"Updating library, pid: {lib.get('pid')}.")
    except Exception as err:
        print(f"Error: {err} with lib pid: {lib.get('pid')}")
        errors.append(lib.pid)

print(f"Libraries updated with {len(errors)} errors.")
print(errors)

print(f'Default times set for exception open days in libraries: {default_time_libs}')

ILL Request

Write a script to add a default value for loan_status.

import click
from rero_ils.modules.ill_requests.api import ILLRequest, ILLRequestsSearch

click.secho('Updating ill_requests loan_status...')
ill_without_loan_status = ILLRequestsSearch().exclude('exists', field='loan_status')
click.secho(f'Found {ill_without_loan_status.count()} ill_requests without loan status')

status_mapping = {
    "pending": "PENDING",
    "validated": "ITEM_ON_LOAN",
    "denied": "PENDING",
    "closed": "ITEM_RETURNED"
}

errors = []
for hit in ill_without_loan_status.source().scan():
    ill = ILLRequest.get_record(hit.meta.id)
    try:
        status = ill.get("status")
        ill['loan_status'] = status_mapping[status]
        ill.update(ill, True, True, True)
        click.secho(f"Updating ill_request, pid: {ill.get('pid')}.")
    except Exception as err:
        click.secho(f"Error: {err} with ill_requests pid: {ill.get('pid')}")
        errors.append(ill.pid)

click.secho(f"Ill_requests updated with {len(errors)} errors.")

# delete the ill_requests that could not be updated
for pid in errors:
    ILLRequest.get_record_by_pid(pid).delete(dbcommit=True, delindex=True)
    print(pid)

remove old scheduler tasks

from rero_ils.schedulers import current_scheduler
current_scheduler.remove('replace-idby-subjects-imported')
current_scheduler.remove('replace-idby-contribution')
current_scheduler.remove('replace-idby-subjects')

v16.x --> v17.0

Update entities

# update es mapping
poetry run invenio rero es index update-mapping

# rename contribution tables
poetry run invenio alembic upgrade a710021979fe

# move the contribution index into the entity index
poetry run invenio index create  -b rero_ils/modules/entities/mappings/v7/entities/entity-v0.0.1.json 'entities-entity-v0.0.1-20230516'

Aliases

from invenio_search import current_search_client
current_search_client.indices.put_alias('entities-entity-v0.0.1-20230516', 'entities')
current_search_client.indices.put_alias('entities-entity-v0.0.1-20230516', 'entities-entity-v0.0.1')

Fixes PID doc_type for entity (cont->ent)

from invenio_db import db
from invenio_pidstore.models import PersistentIdentifier
PersistentIdentifier.query.filter_by(pid_type='cont').update({'pid_type': 'ent'})
db.session.commit()

Note: remove the contributions es alias
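
A sketch of the alias removal, assuming the old contributions index is the one deleted in the Clean up step below:

from invenio_search import current_search_client

current_search_client.indices.delete_alias(
    index='contributions-contribution-v0.0.1-20230116',
    name='contributions'
)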

Fixes entities jsonschema

from sqlalchemy import func
from rero_ils.modules.entities.api import Entity
from invenio_db import db
# Correct $schema for entities in db
old_schema = 'https://bib.rero.ch/schemas/contributions/contribution-v0.0.1.json'
schema = 'https://bib.rero.ch/schemas/entities/entity-v0.0.1.json'
count = Entity.model_cls.query \
    .filter(Entity.model_cls.json['$schema'].as_string() == old_schema) \
    .update(
        {"json": func.jsonb_set(Entity.model_cls.json, '{$schema}', f'"{schema}"')},
        synchronize_session=False
    )

db.session.commit()
print(f"{count} updated entities")
poetry run invenio reroils index reindex -t ent
poetry run invenio reroils index run -c 7 -d

Template entity migration

poetry run /network/nfs/data_ils/ils/scripts/entities.py reroils utils correct-templates -c -l <change_this_log_file>

Verify that template 2015 is correct (no contribution with entities having only a type!).
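
One way to verify, sketched under the assumption that the template stores its document data under a 'data' key:

from rero_ils.modules.templates.api import Template

template = Template.get_record_by_pid('2015')
for contribution in template.get('data', {}).get('contribution', []):
    entity = contribution.get('entity', {})
    if set(entity.keys()) == {'type'}:  # an entity with only a type is broken
        print('broken contribution entity:', entity)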

Document entity migration

index_name=`poetry run invenio rero es index info -i documents`
echo $index_name
poetry run invenio rero es index move -v documents $index_name documents-document-v0.0.1-20230531
poetry run invenio index delete $index_name
poetry run python  /network/nfs/data_ils/ils/scripts/entities.py reroils utils correct-documents -c -l <change_this_log_file>

Alternative approach on worker

from rero_invenio_base.modules.tasks import run_on_worker
from rero_invenio_base.modules.utils import chunk
from rero_ils.modules.documents.api import Document

n = 1
with open('/network/nfs/data_ils/ils/scripts/entities.py') as f:
    src = f.read()
    for c in chunk([str(val) for val in Document.get_all_ids()], int(Document.count() / 14.9)):
        run_on_worker.delay(src, 'do_documents', ids=c, logfile=f'/network/nfs/data_ils/ils/logs/create-authorized-access-points-2023-05-28-{n}.log', commit=True)
        print(f'start {n}')
        n += 1

Delete old sync agents scheduler tasks and time stamps.

from rero_ils.schedulers import current_scheduler
current_scheduler.remove('sync-agents')

from invenio_cache import current_cache
data = current_cache.get('timestamps')
data.pop('sync_agents')
current_cache.set(key='timestamps', value=data, timeout=0)

Clean up

poetry run invenio index delete contributions-contribution-v0.0.1-20230116

v15.x --> v16.0

  1. update the es mapping: poetry run invenio rero es index update-mapping
  2. add the new 'pro_statistics_manager' role using the CLI: poetry run invenio roles create -d 'Professional: Statistics manager' pro_statistics_manager
  3. ill request: is_ill_pickup and ill_pickup_name (new fields) -> there is an alembic script.
# fix the contributions alias by hand using an Elasticsearch browser extension (e.g. Elasticvue)
# Note: not necessarily done by the previous script: poetry run invenio alembic upgrade add75cbcad66
poetry run invenio alembic upgrade e3eb396b39bb
  4. item operation history: change the type of trigger to keyword (missing field definition), then reindex operation_logs.
from invenio_search import current_search_client
def reindex(source, destination):
    res = current_search_client.reindex(
        body=dict(
            source=dict(
                index=source
            ),
            dest=dict(
                index=destination,
                version_type='external_gte'
            )
        ),
        wait_for_completion=False
    )
    return res['task']

def index_in_new(indices):
    tasks = []
    body = {
        "settings": {
            "number_of_shards": "8",
            "number_of_replicas": "1",
            "max_result_window": "100000"
        }
    }
    for index_name in indices:
        print(index_name)
        new_index_name = f'{index_name}-new'
        current_search_client.indices.create(new_index_name, body=body)
        tasks.append(reindex(index_name, new_index_name))
    return tasks

def remove_old(indices):
    for index_name in indices:
        current_search_client.indices.delete(index_name)
        print(f'{index_name} has been deleted')

def rename_to_old(indices):
    tasks = []
    body = {
        "settings": {
            "number_of_shards": "8",
            "number_of_replicas": "1",
            "max_result_window": "100000"
        }
    }
    for index_name in indices:
        print(index_name)
        new_index_name = f'{index_name}-new'
        current_search_client.indices.create(index_name, body=body)
        tasks.append(reindex(new_index_name, index_name))
    return tasks

# Execute one line after the other
# Get the list of the operation logs indices
indices = list(current_search_client.indices.get_alias('operation_logs').keys())
tasks = index_in_new(indices)
# Check for completion
[current_search_client.tasks.get(t).get('completed') for t in tasks]
remove_old(indices)
tasks = rename_to_old(indices)
[current_search_client.tasks.get(t).get('completed') for t in tasks]
remove_old([i+'-new' for i in indices])

v14.x --> v15.0

update ES mapping

poetry run invenio rero es index update-mapping

run alembic migration scripts:

poetry run invenio alembic upgrade

In case of error, run the following on the database (try it on a test server first):

update alembic_version set version_num='eec683a446e5' where version_num='e655021de0de';
update alembic_version set version_num='eec683a446e5' where version_num='8145a7cdef99';

poetry run invenio alembic upgrade 5f0b086e4b82
poetry run invenio alembic upgrade 8145a7cdef99
poetry run invenio alembic stamp 8145a7cdef99

run the following commands to assign permissions to this new role using the CLI:

poetry run invenio reroils fixtures import_role_policies data/role_policies.json
poetry run invenio reroils fixtures import_system_role_policies data/system_role_policies.json

reindex the following indices into new indices (ES side)

Easier with: https://github.com/rero/rero-invenio-base/pull/12 (poetry run pip install git+https://github.com/rerowep/rero-invenio-base.git@wep-es-tasks)

acq_orders-acq_order-v0.0.1

index_name=`poetry run invenio rero es index info -i acq_orders`
echo $index_name
poetry run invenio rero es index move acq_orders $index_name acq_orders-acq_order-v0.0.1-20230313
poetry run invenio index delete $index_name

items-item-v0.0.1

index_name=`poetry run invenio rero es index info -i items`
echo $index_name
poetry run invenio rero es index move items $index_name items-item-v0.0.1-20230313
poetry run invenio index delete $index_name

documents-document-v0.0.1

index_name=`poetry run invenio rero es index info -i documents`
echo $index_name
poetry run invenio rero es index move documents $index_name documents-document-v0.0.1-20230313
poetry run invenio index delete $index_name

patrons-patron-v0.0.1

index_name=`poetry run invenio rero es index info -i patrons`
echo $index_name
poetry run invenio rero es index move patrons $index_name patrons-patron-v0.0.1-20230313
poetry run invenio index delete $index_name

items (Python reindexing: issues and items with requests)

Reindex Items with Requests

from rero_ils.modules.loans.api import LoansSearch
from rero_ils.modules.items.api import Item
from rero_ils.modules.loans.models import LoanState

states = [
    LoanState.PENDING,
    LoanState.ITEM_AT_DESK,
    LoanState.ITEM_IN_TRANSIT_FOR_PICKUP,
    LoanState.ITEM_IN_TRANSIT_TO_HOUSE
]
item_requested_pids = {
    hit.item_pid.value
    for hit in LoansSearch().filter('terms', state=states).source('item_pid').scan()
}
for pid in item_requested_pids:
    rec = Item.get_record_by_pid(pid)
    rec.reindex()

Reindex Issues:

from rero_invenio_base.modules.tasks import run_on_worker
from rero_invenio_base.modules.utils import chunk
from rero_ils.modules.items.api import ItemsSearch
code = '''
def reindex(_ids):
    from rero_ils.modules.items.api import Item
    n = 0
    errors = []
    for _id in _ids:
        try:
            doc = Item.get_record(_id)
            doc.reindex()
            n += 1
        except Exception as e:
            print('error', e)
            errors.append(_id)
    return (n, errors)
'''
def get_all_ids():
    search = ItemsSearch().filter('term', type='issue').source().scan()
    for hit in search:
        yield hit.meta.id

for c in chunk([str(val) for val in get_all_ids()], 500):
    run_on_worker.delay(code, 'reindex', c)