maandag 16 december 2019

Voorbeeld Hoe genereer je een dataset uit naar jaar obv begin en einddatum

Voorbeeld 1


# -*- coding: utf-8 -*-
"""
Created on Mon Dec 16 10:19:04 2019

@author: wagene002
"""

#https://stackoverflow.com/questions/43832484/expanding-a-dataframe-based-on-start-and-end-columns-speed
import numpy as np
import pandas as pd
from datetime import timedelta
import datetime as dt


df = pd.DataFrame()
df['BEGIN_DATUM'] = [dt.datetime(2010, 4, 22), dt.datetime(2013, 4, 5)]
df['EIND_DATUM'] = [dt.datetime(2011, 4, 10), dt.datetime(2016, 12, 12)]
df['country'] = ['US', 'EU']
df['letter'] = ['a', 'b']

cols = list(df.columns)

#bepaal per regel het data id
df['data_id'] = np.arange(0, len(df))
df['BEGIN_JAAR']=df['BEGIN_DATUM'].map(lambda x:x.year)
df['EIND_JAAR']=df['EIND_DATUM'].map(lambda x:x.year)

#stop data in series
data_id = df['data_id']
start1 = df['BEGIN_JAAR']
end1 = df['EIND_JAAR']

#diff = ((end-start) / np.timedelta64(1, 'Y')).astype('int') + 1
#bepaal hoeveel jaren er voorkomen in een regel
diff = (end1-start1) + 1
#genereer het data_id uit per aantal gevonden jaren
repeated_id = np.repeat(data_id, diff)

time_df = pd.DataFrame(data={'data_id': repeated_id})
time_df = pd.merge(left=time_df, right=df[['data_id', 'BEGIN_JAAR','EIND_JAAR']], on=['data_id'])


time_df['year_id'] = np.arange(0, len(time_df))

min_year_id = time_df.groupby('data_id')['year_id'].min().reset_index().rename(columns={'year_id': 'min_year_id'})
time_df = pd.merge(left=time_df, right=min_year_id, on=['data_id'])
#years_to_add = (time_df['year_id'] - time_df['min_year_id']) * np.timedelta64(1, 'Y')
years_to_add = (time_df['year_id'] - time_df['min_year_id'])
time_df['JAARTAL'] = time_df['BEGIN_JAAR'] + years_to_add

#time_df = time_df[time_df['JAARTAL'].dt.dayofweek < 5]

dfuit = pd.merge(left=df, right=time_df[['data_id', 'JAARTAL']], on=['data_id'])
dfuit = dfuit[['JAARTAL']+cols]




ander werkend voorbeeld


# -*- coding: utf-8 -*-
"""
Created on Tue Dec 10 10:40:15 2019

@author: wagene002
"""

import numpy as np
import pandas as pd
from datetime import timedelta


xls_file=pd.ExcelFile(r'O:\Input\03_ZichtenGrip\LLV\rechten\__5__Grip op sociaal Recht.xlsx')

dfin=xls_file.parse(0,skiprows=0,dtype=str)

dfin=dfin.astype({'VD_Ingangsdatum': 'datetime64', 'VD_Einddatum': 'datetime64'})
dictVeldnamen={'Valid_BSN':'VALID_BSN','LR_BSN': 'BSN', 'VD_Ingangsdatum': 'BEGIN_DATUM','VD_Einddatum':'EIND_DATUM','Code_Bron': 'CODE_BRON'}
dfin.rename(columns=dictVeldnamen,inplace='true')
#Dit veld is niet goed. Deze gaan we juist berekenen
dfin.drop('JAAR', axis=1,inplace=True)
dfin.drop_duplicates(subset=None, keep='last', inplace=True)


dfin['VALID_BSN'].fillna('0',inplace=True)
dfin['BSN'].fillna('',inplace=True)

####dfin=dfin[dfin['BSN']=='233749172']

cols = list(dfin.columns)

#bepaal per regel het data id
dfin['data_id'] = np.arange(0, len(dfin))
dfin['BEGIN_JAAR']=dfin['BEGIN_DATUM'].map(lambda x:x.year)
dfin['EIND_JAAR']=dfin['EIND_DATUM'].map(lambda x:x.year)


#stop data in series
data_id = dfin['data_id']
start = dfin['BEGIN_JAAR']
end = dfin['EIND_JAAR']

#diff = ((end-start) / np.timedelta64(1, 'Y')).astype('int') + 1
#bepaal hoeveel jaren er voorkomen in een regel
diff = (end-start)  + 1


#genereer het data_id uit per aantal gevonden jaren
repeated_id = np.repeat(data_id, diff)
time_df = pd.DataFrame(data={'data_id': repeated_id})
time_df = pd.merge(left=time_df, right=dfin[['data_id', 'BEGIN_JAAR','EIND_JAAR']], on=['data_id'])


time_df['year_id'] = np.arange(0, len(time_df))

min_year_id = time_df.groupby('data_id')['year_id'].min().reset_index().rename(columns={'year_id': 'min_year_id'})
time_df = pd.merge(left=time_df, right=min_year_id, on=['data_id'])
#years_to_add = (time_df['year_id'] - time_df['min_year_id']) * np.timedelta64(1, 'Y')
years_to_add = (time_df['year_id'] - time_df['min_year_id'])
time_df['JAAR'] = time_df['BEGIN_JAAR'] + years_to_add

#time_df = time_df[time_df['JAARTAL'].dt.dayofweek < 5]

dfuit = pd.merge(left=dfin, right=time_df[['data_id', 'JAAR']], on=['data_id'])
dfuit = dfuit[['JAAR']+cols]
dfuit.to_csv(r'O:\Output\01.Overstijgend\01_ZichtenGrip\LLV\rechten\RECHT_LLV.csv', sep=';')


















Geen opmerkingen:

Een reactie posten