This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from pandas.tseries.offsets import MonthEnd | |
def assign_to_months(start_date:pd.Timestamp, end_date:pd.Timestamp, total:float, year:int) -> pd.Series: | |
""" | |
start_date: start date of the contract | |
end_date: end date of the contract | |
total: total amount of the contract | |
year: year to be accrued, everything outside this year will not be accrued | |
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
import random | |
from datetime import datetime, timedelta | |
def poisson_timeseries_generator(tau, start_time, end_time): | |
""" | |
Generates a time series with a poisson distribution | |
    tau: interval length, i.e. the mean expected time between events
start_time: starting date of the time series. %Y-%m-%d %H:%M:%S format |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import stanza
import pandas as pd

# Toy dataframe: a single labelled row of raw text in the 'text' column.
df = pd.DataFrame({'label': [1], 'text': ['Hi Juan Carlos']})

# Build the Stanza NLP engine for Portuguese with tokenizer,
# multi-word-token expander, POS tagger and lemmatizer processors.
nlp_pt = stanza.Pipeline(lang='pt', processors='tokenize,mwt,pos,lemma')
# Tokenize, lemmatize and POS
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.decomposition import LatentDirichletAllocation | |
from sklearn.feature_extraction.text import CountVectorizer | |
#For cvectorizer | |
def do_nothing(x):
    """Identity function: return the argument unchanged.

    Used as a no-op tokenizer/preprocessor so CountVectorizer can accept
    input that is already tokenized.
    """
    result = x
    return result
#Create CV matrix | |
#Use max_df to delete words that appear in more than x.x% of documents (float is %)
#Use min_df to delete words that appear in fewer than x documents (int is x)
#Use ngram_range to create ngrams and use them as extra features |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gensim

# Hold out the last 500 documents as the test split.
end = -500

# `docs` is a pd.Series whose values are lists of tokens, one list per
# document; tokens should already be normalized (lowercased, accents
# stripped, etc.).
# Doc2vec training requires TaggedDocument objects, each carrying the
# document's tokens plus a unique integer tag.
train = [
    gensim.models.doc2vec.TaggedDocument(tokens, [doc_id])
    for doc_id, tokens in enumerate(docs.values[:end])
]
test = docs.values[end:]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function cosinesim(A,B){ | |
var dotproduct=0; | |
var mA=0; | |
var mB=0; | |
for(i = 0; i < A.length; i++){ | |
dotproduct += (A[i] * B[i]); | |
mA += (A[i]*A[i]); | |
mB += (B[i]*B[i]); | |
} | |
mA = Math.sqrt(mA); |