# # Copyright (C) 2020 Alessandro Languasco # import pandas as pd import numpy as np # open output fileanalysis fileanalysis= open("analysis.txt","w") # functions needed to create the normalised values def f(x): y = np.log(x) return y data=pd.read_csv(r'EK-diff.csv', sep=';') data.head(2) df = pd.DataFrame(data, columns= ['q','EKdiff']) # create a new column with normalised values df['EKdiffnorm'] = (df['EKdiff'] / f(df['q']) ) #print (df) #FINDING MAX AND MIN and where they are attained #M=df['EKdiffnorm'].max() #idxM=df['EKdiffnorm'].idxmax() #argM=df['q'].iloc[idxM] #m=df['EKdiffnorm'].min() #idxm=df['EKdiffnorm'].idxmin() #argm=df['q'].iloc[idxm] #print('minimal value on column (EK(q)-EK(q)^+)/log(q) =',m,'attained at q =',argm) #print('maximal value on column (EK(q)-EK(q)^+)/log(q) =',M,'attained at q =',argM) #FINDING MAX and second MAX; MIN and second MIN; and where they are attained print(' ***** Statistics for EK(q)-EK(q)^+ *****') print('name source fileanalysis = EK-diff.csv') fileanalysis.write(" ***** Statistics for (EK(q)-EK(q)^+)/log(q) ***** ") fileanalysis.write("\n") fileanalysis.write("name source fileanalysis =EK-diff.csv") fileanalysis.write("\n") print(' ***** MAX *****') maxima =df['EKdiffnorm'].nlargest(2) M=maxima.iloc[0] idxM=maxima.idxmax() argM=df['q'].iloc[idxM] M1=maxima.iloc[1] idxM1=maxima.idxmin() argM1=df['q'].iloc[idxM1] print('maximal value on column (EK(q)-EK(q)^+)/log(q) =',M,'attained at q =',argM) print('second maximal value on column (EK(q)-EK(q)^+)/log(q) =',M1,'attained at q =',argM1) fileanalysis.write("maximal value on column (EK(q)-EK(q)^+)/log(q) = ") fileanalysis.write(str(M)) fileanalysis.write(" attained at q = ") fileanalysis.write(str(argM)) fileanalysis.write("\n") fileanalysis.write("second maximal value on column (EK(q)-EK(q)^+)/log(q) = ") fileanalysis.write(str(M1)) fileanalysis.write(" attained at q = ") fileanalysis.write(str(argM1)) fileanalysis.write("\n") print(' ***** MIN *****') minima =df['EKdiffnorm'].nsmallest(2) m=minima.iloc[0] idxm=minima.idxmin() argm=df['q'].iloc[idxm] m1=minima.iloc[1] idxm1=minima.idxmax() argm1=df['q'].iloc[idxm1] print('minimal value on column (EK(q)-EK(q)^+)/log(q) =',m,'attained at q =',argm) print('second minimal value on column (EK(q)-EK(q)^+)/log(q) =',m1,'attained at q =',argm1) fileanalysis.write(" ***** MIN *****\n") fileanalysis.write("minimal value on column (EK(q)-EK(q)^+)/log(q) = ") fileanalysis.write(str(m)) fileanalysis.write(" attained at q = ") fileanalysis.write(str(argm)) fileanalysis.write("\n") fileanalysis.write("second minimal value on column (EK(q)-EK(q)^+)/log(q) = ") fileanalysis.write(str(m1)) fileanalysis.write(" attained at q = ") fileanalysis.write(str(argm1)) fileanalysis.write("\n") print(' ***** STATS (EK(q)-EK(q)^+)/log(q) *****') # number of values >0 for EKdiffnorm total=float(len(df)) greater0 = len(df['EKdiffnorm'].loc[df.EKdiffnorm > 0]) percent=float(greater0)/total print('number of (EK(q)-EK(q)^+)/log(q)>0 :', greater0,'percentage =', percent*100) fileanalysis.write(" ***** STATS (EK(q)-EK(q)^+)/log(q) *****\n") fileanalysis.write("number of (EK(q)-EK(q)^+)/log(q)>0 : ") fileanalysis.write(str(greater0)) fileanalysis.write(" percentage = ") fileanalysis.write(str(percent*100)) fileanalysis.write("\n") # number of values <0 for EKdiffnorm less0 = len(df['EKdiffnorm'].loc[df.EKdiffnorm < 0]) percent=float(less0)/total print('number of (EK(q)-EK(q)^+)/log(q)<0 :',less0,'percentage =', percent*100) fileanalysis.write(" ***** STATS (EK(q)-EK(q)^+)/log(q) *****\n") fileanalysis.write("number of (EK(q)-EK(q)^+)/log(q)<0 : ") fileanalysis.write(str(less0)) fileanalysis.write(" percentage = ") fileanalysis.write(str(percent*100)) fileanalysis.write("\n") print('total number (less0+greater0) =', less0+greater0) print('total number (number of rows) =', int(total)) fileanalysis.write("total number (less1+greater1) = ") fileanalysis.write(str(less0+greater0)) fileanalysis.write("\n") fileanalysis.write("total number (number of rows) = ") fileanalysis.write(str(int(total))) fileanalysis.write("\n") # number of EKdiffnorm > 0.6 greater2= len(df.loc[df.EKdiffnorm > 0.6]) if greater2 > 0 : print(' ***** Possible wrong datas *******') print('number of EKdiffnorm > 0.6 :',greater2,'percentage =', percent*100) # number of EKdiffnorm <- 0.6 greater2= len(df.loc[df.EKdiffnorm < -0.6]) if greater2 > 0 : print(' ***** Possible wrong datas *******') print('number of EKdiffnorm < -0.6 :',greater2,'percentage =', percent*100) print('******* MEAN, STANDARD DEVIATION, VARIANCE ********') #print(df.describe()) mean = df['EKdiffnorm'].mean() devst = df['EKdiffnorm'].std() variance = df.var()['EKdiffnorm'] print('mean (EKdiffnorm) = ',mean) print('standard deviation (EKdiffnorm) = ',devst) print('variance (EKdiffnorm) = ',variance) fileanalysis.write('******* MEAN, STANDARD DEVIATION, VARIANCE ********') fileanalysis.write("\n") fileanalysis.write('mean (EKdiffnorm) = ') fileanalysis.write(str(mean)) fileanalysis.write("\n") fileanalysis.write('standard deviation (EKdiffnorm) = ') fileanalysis.write(str(devst)) fileanalysis.write("\n") fileanalysis.write('variance (EKdiffnorm) = ') fileanalysis.write(str(variance)) fileanalysis.write("\n") print('*******') print('********* plotting histogram and normal curves *********') import matplotlib.pyplot as plt from scipy.stats import norm df = df.astype(float) mean = mean.astype(float) devst = devst.astype(float) num_bins = 100 plt.hist(df['EKdiffnorm'], num_bins,color = "skyblue", histtype='bar', ec='black') # Plot the Probability Density Function. xmin, xmax = plt.xlim() x = np.linspace(xmin, xmax, 100) bin_width = (x.max() - x.min()) / num_bins mass_correction = total * bin_width print('******** black normal function data ******* ') mass_correction_black = mass_correction devst_black = 1.1*devst print('total data ', total) print('max x = ', x.max()) print('min x = ', x.min()) print('bin_width = ', bin_width) print('num bins = ', num_bins) print('mean = ', mean) print('standard deviation = ', devst_black) print('mass correction = ', mass_correction) p = mass_correction_black * norm.pdf(x, mean, devst_black) #p = 10400*norm.pdf(x, mean, devst) plt.plot(x, p, 'k', linewidth=1) print('******** red normal function data ******* ') #devst_red = 0.775*devst #mass_correction_red = mass_correction*0.775 devst_red = 0.65*devst mass_correction_red = mass_correction*0.78 print('total data ', total) print('max x = ', x.max()) print('min x = ', x.min()) print('bin_width = ', bin_width) print('num bins = ', num_bins) print('mean = ', mean) print('standard deviation = ', devst_red) print('mass correction = ', mass_correction_red) p = mass_correction_red * norm.pdf(x, mean, devst_red) #p = 7300*norm.pdf(x, mean, 0.7*devst) plt.plot(x, p, 'r', linewidth=1) #title = "$(\mathfrak{G}_{q}-\mathfrak{G}_{q}^+)/\log q$: $\mu =$ %.2f, $\sigma =$ %.2f, $\mathcal{M} =$%.2f" % (mean, devst, mass_correction) title = "$(\mathfrak{G}_{q}-\mathfrak{G}_{q}^+)/\log\ q$: histogram" plt.title(title) #plt.show() caption = "black curve : $\mathcal{M}\cdot \mathcal{N}(x,\mu,1.1\cdot\sigma)$ - red curve : $0.78\cdot\mathcal{M}\cdot \mathcal{N}(x,\mu,0.65\cdot\sigma)$" plt.xlabel(caption) #plt.ylabel(caption) plt.savefig("/Users/languasc/Documents/LANGUASCO/matematica/lavori/[58]Moree-kummer-ratio/histograms/histo-EKdiffnorm.png", dpi=600, bbox_inches='tight') # clear the plot plt.cla() # scatter plot print('********* saved in histo-EKdiffnorm.png *********') # scatter plot print('********') print('******** Print scatter EKdiffnorm ******* ') #setting dimentions plt.figure(figsize=(11, 8)) # setting ticks million=1000000 # parameter ax = plt.axes() ax.set_xticks([0,million,2*million, 3*million, 4*million, 5*million, 6*million, 7*million, 8*million , 9*million, 10*million ]) ax.set_xticklabels(['$0$', '$10^6$','$2\cdot 10^6$','$3\cdot 10^6$','$4\cdot 10^6$','$5\cdot 10^6$','$6\cdot 10^6$','$7\cdot 10^6$','$8\cdot 10^6$','$9\cdot 10^6$','$10^7$']) ax.set_yticks([-0.6,-0.5,-0.4,-0.3,-0.2, -0.1, 0, 0.1, 0.2,0.3, 0.4, 0.5, 0.6]) ax.set_yticklabels(['$-0.6$', '$-0.5$','$-0.4$','$-0.3$','$-0.2$', '$-0.1$', '$0$', '$0.1$', '$0.2$', '$0.3$', '$0.4$', '$0.5$', '$0.6$']) # setting bounds plt.xlim(0,10*million) # building the grid - horizontal lines count=7 for n in range(count): ax.axhline(y=-0.1*n, linewidth = 1, color='k', linestyle='dotted') ax.axhline(y=0.1*n, linewidth = 1, color='k', linestyle='dotted') ## red line in y=0 ax.axhline(y=0.0, linewidth = 1, color='r', linestyle='dotted') # building the grid - vertical lines count=10 for n in range(count): ax.axvline(x=million*n, linewidth = 1, color='k', linestyle='dotted') # plotting the scatter plot plt.scatter(df['q'], df['EKdiffnorm'], s=1, c = 'black', marker = '.') # plotting max and min maxekdiffnorm = df['EKdiffnorm'].max() minekdiffnorm = df['EKdiffnorm'].min() plt.plot(argM, maxekdiffnorm, marker='o', markersize=4, color="red") plt.plot(argm, minekdiffnorm, marker='o', markersize=4, color="blue") # inserting a title and a caption title = "$(\mathfrak{G}_{q}-\mathfrak{G}_{q}^+)/\log\ q$: scatter plot" plt.title(title) caption = "" plt.xlabel(caption) # save plot on file plt.savefig("/Users/languasc/Documents/LANGUASCO/matematica/lavori/[58]Moree-kummer-ratio/plots/EKdiffnorm-scat.png", dpi=600, bbox_inches='tight', orientation='landscape') # clear plot plt.cla() print('********* saved in EKdiffnorm-scat.png *********') print('*******') print(' ***** End PYTHON script *****') fileanalysis.close() ''' print(' ***** Exported data frame for gnuplot *****') print(' ***** in fileanalysis named: dati-totali-per-grafico.txt *****') df.to_csv (r'dati-totali-per-grafico.txt', sep='\t', index = False, header=True) print(' ***** Exported data frame for gnuplot *****') print(' ***** in fileanalysis named: diff.dat *****') df.to_csv (r'diffnorm.dat', sep='\t', index = False, header=True) '''