This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from pandas.tseries.offsets import MonthEnd | |
def assign_to_months(start_date:pd.Timestamp, end_date:pd.Timestamp, total:float, year:int) -> pd.Series: | |
""" | |
start_date: start date of the contract | |
end_date: end date of the contract | |
total: total amount of the contract | |
year: year to be accrued, everything outside this year will not be accrued | |
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
import random | |
from datetime import datetime, timedelta | |
def poisson_timeseries_generator(tau, start_time, end_time): | |
""" | |
Generates a time series with a poisson distribution | |
    tau: interval length, i.e. the mean expected time between events
start_time: starting date of the time series. %Y-%m-%d %H:%M:%S format |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import stanza
import pandas as pd

# Toy dataframe: a single labelled row of raw text in the 'text' column.
df = pd.DataFrame({'label': [1], 'text': ['Hi Juan Carlos']})

# Build the Stanza NLP engine for Portuguese with tokenizer,
# multi-word-token expander, POS tagger and lemmatizer processors.
nlp_pt = stanza.Pipeline(lang='pt', processors='tokenize,mwt,pos,lemma')
# Tokenize, lemmatize and POS
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.decomposition import LatentDirichletAllocation | |
from sklearn.feature_extraction.text import CountVectorizer | |
#For cvectorizer | |
def do_nothing(x):
    """Identity function: return the argument unchanged.

    Used as a no-op tokenizer/preprocessor so CountVectorizer can accept
    input that is already tokenized.
    """
    result = x
    return result
#Create CV matrix | |
#Use max_df to delete words that appear in more than x.x% of documents (float is %)
#Use min_df to delete words that appear in fewer than x documents (int is x)
#Use ngram_range to create ngrams and use them as extra features |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gensim

# Hold out the last 500 documents as the test split.
end = -500

# `docs` is a pd.Series whose values are lists of tokens, one list per
# document; tokens should already be normalized (lowercased, accents
# stripped, etc.).
# Doc2vec training requires TaggedDocument objects, each carrying the
# document's tokens plus a unique integer tag.
train = [
    gensim.models.doc2vec.TaggedDocument(tokens, [doc_id])
    for doc_id, tokens in enumerate(docs.values[:end])
]
test = docs.values[end:]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function cosinesim(A,B){ | |
var dotproduct=0; | |
var mA=0; | |
var mB=0; | |
for(i = 0; i < A.length; i++){ | |
dotproduct += (A[i] * B[i]); | |
mA += (A[i]*A[i]); | |
mB += (B[i]*B[i]); | |
} | |
mA = Math.sqrt(mA); |