Sirikata: libcore/include/sirikata/core/transfer/URL.hpp Source File

Sirikata
00001 /*  Sirikata Transfer -- Content Transfer management system
00002  *  URL.hpp
00003  *
00004  *  Copyright (c) 2008, Patrick Reiter Horn
00005  *  All rights reserved.
00006  *
00007  *  Redistribution and use in source and binary forms, with or without
00008  *  modification, are permitted provided that the following conditions are
00009  *  met:
00010  *  * Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer.
00012  *  * Redistributions in binary form must reproduce the above copyright
00013  *    notice, this list of conditions and the following disclaimer in
00014  *    the documentation and/or other materials provided with the
00015  *    distribution.
00016  *  * Neither the name of Sirikata nor the names of its contributors may
00017  *    be used to endorse or promote products derived from this software
00018  *    without specific prior written permission.
00019  *
00020  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
00021  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
00022  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
00023  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
00024  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00025  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00026  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00027  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00028  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00029  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00030  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00031  */
00032 /*  Created on: Jan 5, 2009 */
00033 
00034 #ifndef SIRIKATA_URL_HPP__
00035 #define SIRIKATA_URL_HPP__
00036 
00037 #include <sirikata/core/util/Platform.hpp>
00038 #include <sirikata/core/transfer/URI.hpp>
00039 
00040 #ifdef _WIN32
00041 #include <locale>
00042 #endif
00043 
00044 namespace Sirikata {
00045 namespace Transfer {
00046 
/**
 * The "directory" part of a URL: protocol, host (optionally with ":port"),
 * user name and base directory -- everything except the final file name.
 *
 * The protocol and host are normalized by cleanup() (lower-cased, whitespace
 * removed) whenever a context is constructed or parsed. The plain setters
 * (setProto(), setHost(), ...) store their argument verbatim.
 */
class URLContext {
    friend class URL;

    std::string mProto;     ///< Protocol/scheme, e.g. "http".
    std::string mHost;      ///< Host, including ":port" if one was given.
    std::string mUser;      ///< User name; empty if none was specified.
    std::string mDirectory; ///< Base directory without leading or trailing slash.
//  AuthenticationCreds mAuth;

    /**
     * Placeholder for collapsing "../" components in a directory string.
     * Currently a no-op: @p str is left untouched.
     */
    static inline void resolveParentDirectories(std::string &str) {
        // Do nothing for now.
        // TODO: resolve ".." path components; earlier attempts were never
        // completed and have been dropped from this function.
        (void)str;
    }

    /** Predicate for std::remove_if: true if the byte is whitespace. */
    struct IsSpace {
        inline bool operator()(const unsigned char c) const {
#ifdef _WIN32
            // Locale-aware overload from <locale>; it is a template on the
            // character type, so hand it a real char.
            return std::isspace(static_cast<char>(c), std::locale());
#else
            // The <cctype> classifier requires a value representable as
            // unsigned char (or EOF). Passing the byte through a signed char
            // would turn bytes >= 0x80 negative, which is undefined behavior.
            return std::isspace(static_cast<int>(c)) != 0;
#endif
        }
    };

    /**
     * Normalizes a protocol or host string in place: lower-cases every
     * character and then removes all whitespace characters.
     */
    void cleanup(std::string &s) {
        // hostnames and protocols are case-insensitive.
        for (std::string::size_type i = 0; i < s.length(); ++i) {
#ifdef _WIN32
            s[i] = std::tolower(s[i], std::locale());
#else
            // Go through unsigned char so bytes >= 0x80 are never passed to
            // std::tolower() as negative values (undefined behavior).
            s[i] = static_cast<char>(
                std::tolower(static_cast<unsigned char>(s[i])));
#endif
        }
        // remove any illegal characters such as spaces.
        s.erase(std::remove_if(s.begin(), s.end(), IsSpace()), s.end());
    }

public:
    /** Constructs an empty context (all four fields are empty strings). */
    URLContext() {
    }

    /**
     * Constructs a context from its four components.
     * @p newProto and @p newHost are normalized with cleanup();
     * @p newUser and @p newDirectory are stored as given.
     */
    URLContext(const std::string &newProto,
            const std::string &newHost,
            const std::string &newUser,
            const std::string &newDirectory)
        : mProto(newProto),
          mHost(newHost),
          mUser(newUser),
          mDirectory(newDirectory){

        cleanup(mProto);
        cleanup(mHost);
    }

    /**
     * Constructs a context by overriding selected fields of @p parent.
     * Each pointer argument may be null, in which case the corresponding
     * field is copied from @p parent; otherwise the pointed-to string is
     * used. Protocol and host are normalized with cleanup().
     */
    URLContext(const URLContext &parent,
            const std::string *newProto,
            const std::string *newHost,
            const std::string *newUser,
            const std::string *newDirectory)
        : mProto(newProto?*newProto:parent.proto()),
          mHost(newHost?*newHost:parent.host()),
          mUser(newUser?*newUser:parent.username()),
          mDirectory(newDirectory?*newDirectory:parent.basepath()){
        cleanup(mProto);
        cleanup(mHost);
    }

    /**
     * Constructs a context by copying @p parent and then applying
     * @p identifier on top of it with parse(), so @p identifier may be
     * absolute, protocol-relative ("//host/dir"), server-relative ("/dir")
     * or directory-relative ("dir").
     */
    URLContext(const URLContext &parent, const std::string &identifier)
            : mProto(parent.proto()),
              mHost(parent.host()),
              mUser(parent.username()),
              mDirectory(parent.basepath()) {
        parse(identifier);
    }

    /**
     * Constructs a context by parsing @p identifier starting from an empty
     * context. The whole string is treated as a directory: a trailing '/'
     * is appended by parse() if it is missing.
     */
    URLContext(const std::string &identifier) {
        parse(identifier);
    }
private:
    /**
     * Applies @p identifier to this context, overwriting only the fields the
     * identifier specifies:
     *  - "proto:..."   sets the protocol and clears the user;
     *  - "//[user@]host/..." sets host and user;
     *  - "/dir/..."    replaces the directory;
     *  - "dir/..."     is appended to the current directory.
     * Afterwards protocol and host are normalized with cleanup().
     *
     * @param identifier Taken by value because a trailing '/' is appended
     *                   locally when it is missing.
     */
    void parse(std::string identifier) {
        if (!identifier.empty() && identifier[identifier.length()-1] != '/') {
            identifier += '/';
        }
        std::string::size_type colonpos = identifier.find(':');
        std::string::size_type firstslashpos = identifier.find('/');
        std::string::size_type startpos = 0;
        // A colon only introduces a protocol if it comes before the first slash.
        if (colonpos != std::string::npos &&
                (firstslashpos == std::string::npos || colonpos < firstslashpos)) {
            /* FIXME: Only accept [a-z0-9] as part of the protocol. We don't want an IPv6 address or
              long filename with a colon in it being misinterpreted as a protocol.
              Also, port numbers will be preceded by a colon */

            // global path
            mProto = identifier.substr(0, colonpos);
            mUser = std::string();
            startpos = colonpos+1;
        }

        if (identifier.length() > startpos+2 && identifier[startpos]=='/' && identifier[startpos+1]=='/') {
            /* FIXME: IPv6 addresses have colons and are surrounded by braces.
             * Example: "http://someuser@[2001:5c0:1101:4300::1]:8080/somedir/somefile*/

            // protocol-relative path
            mUser = std::string(); // clear out user (set to blank if unspecified)
            //mHost = std::string(); // we actually keep this the same.

            std::string::size_type atpos, slashpos;
            std::string::size_type beginpos = startpos+2;
            // identifier ends in '/' and is longer than startpos+2, so a slash
            // at or after beginpos always exists.
            slashpos = identifier.find('/',beginpos);
            atpos = identifier.rfind('@', slashpos);
            // Authenticated URL
            if (atpos != std::string::npos && atpos >= beginpos && atpos < slashpos) {
                mUser = identifier.substr(beginpos, atpos-beginpos);
                beginpos = atpos+1;
            }
            if (slashpos != beginpos) {
                if (slashpos == std::string::npos) {
                    mHost = identifier.substr(beginpos);
                } else {
                    mHost = identifier.substr(beginpos, slashpos-beginpos);
                }
            } else {
                // "//" immediately followed by '/': explicit empty host.
                mHost = std::string();
            }
            startpos = slashpos;
        }
        if (identifier.length() > startpos && identifier[startpos]=='/') {
            // server-relative path
            std::string::size_type lastdir = identifier.rfind('/');
            if (lastdir > startpos) {
                mDirectory = identifier.substr(startpos+1, lastdir-startpos-1);
            } else {
                mDirectory = std::string();
            }
        } else {
            // directory-relative path: append to the existing directory.
            std::string::size_type lastdir = identifier.rfind('/');
            if (lastdir != std::string::npos && lastdir > startpos) {
                if (mDirectory.empty()) {
                    mDirectory = identifier.substr(startpos, lastdir-startpos);
                } else {
                    mDirectory += '/' + identifier.substr(startpos, lastdir-startpos);
                }
            }
        }

        resolveParentDirectories(mDirectory);

        cleanup(mProto);
        cleanup(mHost);
    }

public:
    /** @return the protocol string, e.g. "http". */
    inline const std::string &proto() const {
        return mProto;
    }

    /** Replaces the protocol; the value is stored verbatim (no cleanup()). */
    inline void setProto(const std::string &proto) {
        mProto = proto;
    }

    /** @return the user name, or an empty string if none is set. */
    inline const std::string &username() const {
        return mUser;
    }

    /** Replaces the user name. */
    inline void setUsername(const std::string &user) {
        mUser = user;
    }

    /** @return the host as stored, including any ":port" suffix. */
    inline const std::string &host() const {
        return mHost;
    }

    /*
     * hostname getter -- without port number
     *
     * Returns everything before the first ':' in the host, so hosts that
     * themselves contain colons (e.g. bracketed IPv6 literals) are truncated.
     */
    inline const std::string hostname() const {
        std::string::size_type colonPos = mHost.find(':');
        if (colonPos == std::string::npos) {
            return mHost;
        } else {
            return mHost.substr(0, colonPos);
        }
    }

    /*
     * service getter -- returns the "port" of a URL, or empty string
     *                   if there is none
     * Example:
     * http://www.example.com/path -> ""
     * meerkat://hostname.edu:8080/path -> "8080"
     *
     * Returns everything after the first ':' in the host.
     */
    inline const std::string service() const {
        std::string::size_type colonPos = mHost.find(':');
        if (colonPos == std::string::npos) {
            return "";
        } else {
            return mHost.substr(colonPos+1);
        }
    }


    /** Replaces the host; the value is stored verbatim (no cleanup()). */
    inline void setHost(const std::string &host) {
        mHost = host;
    }

    /* The directory, excluding both initial slash and ending slash.
     * Often, this may just be the empty string, but depends on protocol. */
    inline const std::string &basepath() const {
        return mDirectory;
    }

    /** Replaces the base directory; the value is stored verbatim. */
    inline void setBasepath(const std::string &basepath) {
        mDirectory = basepath;
    }

    /**
     * Moves this context up one directory level.
     *
     * @param pathString May be null. If non-null, the directory component
     *        that was removed is prepended to it, followed by '/'. Nothing
     *        is prepended when the directory was already empty, or when the
     *        last '/' of the directory is its first character.
     */
    void toParentPath(std::string *pathString) {
        std::string::size_type slash = mDirectory.rfind('/');
        if (slash == std::string::npos) {
            // Single (or no) component: the directory becomes empty.
            if (pathString) {
                if (!mDirectory.empty()) {
                    *pathString = mDirectory + "/" + *pathString;
                }
            }
            mDirectory = std::string();
        } else {
            if (pathString) {
                if (slash > 0) {
                    *pathString = mDirectory.substr(slash+1) + "/" + *pathString;
                }
            }
            mDirectory = mDirectory.substr(0, slash);
        }
    }

    /**
     * Strips the most specific non-empty part of this context: the last
     * directory component if there is a directory, otherwise the host (and
     * user), otherwise the protocol.
     *
     * @param pathString May be null. If non-null, the textual form of the
     *        stripped part ("dir/", "//[user@]host/" or "proto:") is
     *        prepended to it.
     */
    void toParentContext(std::string *pathString) {
        if (mDirectory.empty()) {
            if (mHost.empty()) {
                if (pathString) {
                    *pathString = mProto + ":" + *pathString;
                }
                mProto = std::string();
            } else {
                if (pathString) {
                    *pathString = "//" + (mUser.empty()?"":mUser+"@") + mHost + "/" + *pathString;
                }
                mHost = std::string();
            }
            // The user is meaningless without a host, so it is cleared too.
            mUser = std::string();
        } else {
            toParentPath(pathString);
        }
    }
/*
    void relocate(const URLContext &source, const URLContext &dest) {
        std::string
    }
*/

    /**
     * Builds "proto://[user@]host[/directory]".
     * @param trailingSlash If true (the default), a final '/' is appended.
     */
    inline std::string toString(bool trailingSlash=true) const {
        std::string ret (mProto + "://" + (mUser.empty() ? "" : (mUser + "@")) + mHost);
        if (!mDirectory.empty()) {
            ret += ("/" + mDirectory);
        }
        if (trailingSlash) {
            return ret + '/';
        }
        return ret;
    }

    /** Ordering by protocol, then user, then host, then directory. */
    inline bool operator< (const URLContext &other) const {
        /* Note: I am testing user before hostname to keep this ordering
         * the same as if you used a string version of the URL.
         */
        if (mProto == other.mProto) {
            if (mUser == other.mUser) {
                if (mHost == other.mHost) {
                    return mDirectory < other.mDirectory;
                }
                return mHost < other.mHost;
            }
            return mUser < other.mUser;
        }
        return mProto < other.mProto;
    }

    /** True if all four fields compare equal (exact string comparison). */
    inline bool operator==(const URLContext &other) const {
        return mDirectory == other.mDirectory &&
            mUser == other.mUser &&
            mHost == other.mHost &&
            mProto == other.mProto;
    }
    /** Negation of operator==. */
    inline bool operator!=(const URLContext &other) const {
        return !((*this) == other);
    }

    /** True if protocol, host, user and directory are all empty. */
    bool empty() const {
        return
            mDirectory.empty() &&
            mUser.empty() &&
            mHost.empty() &&
            mProto.empty();
    }

    /** True if the context is not empty(). */
    operator bool() const {
        return !empty();
    }
};
00414 
00416 inline std::ostream &operator<<(std::ostream &str, const URLContext &urlctx) {
00417     return str << urlctx.toString();
00418 }
00419 
00421 class URL {
00422     URLContext mContext;
00423     std::string mPath; // should have no slashes.
00424 
00425     void findSlash(const std::string &url) {
00426         std::string::size_type slash = url.rfind('/');
00427         if (slash != std::string::npos) {
00428             // FIXME: handle incomplete URLs correctly
00429             if (slash > 0 && url[slash-1] == '/' && !(slash > 1 && url[slash-2] == '/')) {
00430                 // this is actually a hostname section... don't copy it into the filename.
00431                 // unless there were three slashes in a row.
00432                 mPath = std::string();
00433                 mContext.parse(url);
00434             } else {
00435                 mPath = url.substr(slash+1);
00436                 mContext.parse(url.substr(0, slash+1));
00437             }
00438         } else {
00439             std::string::size_type colon = url.find(':');
00440             if (colon != std::string::npos) {
00441                 mPath = url.substr(colon+1);
00442                 mContext.parse(url.substr(0, colon+1));
00443             } else {
00444                 mPath = url;
00445             }
00446         }
00447     }
00448 public:
00450     explicit URL() {
00451     }
00452 
00459     URL(const URLContext &parentContext, const std::string &url)
00460             : mContext(parentContext) {
00461         findSlash(url);
00462     }
00463 
00469     explicit URL(const char *url) {
00470         findSlash(url);
00471     }
00472 
00478     explicit URL(const std::string &url) {
00479         findSlash(url);
00480     }
00481 
00482 
00484     explicit URL(const URI& uri) {
00485         findSlash(uri.toString());
00486     }
00487 
00491     inline const URLContext &context() const {
00492         return mContext;
00493     }
00494 
00496     inline URLContext &getContext() {
00497         return mContext;
00498     }
00499 
00501     inline const std::string &proto() const {
00502         return mContext.proto();
00503     }
00504 
00506     inline const std::string &host() const {
00507         return mContext.host();
00508     }
00509 
00511     inline const std::string hostname() const {
00512         return mContext.hostname();
00513     }
00514 
00516     inline const std::string &username() const {
00517         return mContext.username();
00518     }
00519 
00521     inline const std::string &basepath() const {
00522         return mContext.basepath();
00523     }
00524 
00526     inline const std::string &filename() const {
00527         return mPath;
00528     }
00529 
00530     inline void setFilename(const std::string &file) {
00531         mPath = file;
00532     }
00533 
00538     inline std::string fullpath() const {
00539         if (mContext.basepath().empty()) {
00540             return '/' + mPath;
00541         } else {
00542             return '/' + mContext.basepath() + '/' + mPath;
00543         }
00544     }
00545 /*
00546     void relocate(const URLContext &source, const URLContext &dest) {
00547         getContext().relocate(source, dest);
00548     }
00549 */
00553     inline std::string toString () const {
00554         return mContext.toString() + mPath;
00555     }
00556 
00558     inline bool operator<(const URL &other) const {
00559         // We can ignore the hash if it references the same URL.
00560         if (mContext == other.mContext) {
00561             return mPath < other.mPath;
00562         }
00563         return mContext < other.mContext;
00564     }
00565 
00567     inline bool operator==(const URL &other) const {
00568         // We can ignore the hash if it references the same URL.
00569         return mPath == other.mPath && mContext == other.mContext;
00570     }
00571     inline bool operator!=(const URL &other) const {
00572         // We can ignore the hash if it references the same URL.
00573         return !((*this) == other);
00574     }
00575 
00576     bool empty() const {
00577         return mContext.empty() && mPath.empty();
00578     }
00579 
00580     operator bool() const {
00581         return !empty();
00582     }
00583 
00584     struct Hasher {
00585         size_t operator() (const URL& url)const {
00586             return std::tr1::hash<std::string>()(url.mPath);
00587         }
00588     };
00589 };
00590 
00592 inline std::ostream &operator<<(std::ostream &str, const URL &url) {
00593     return str << url.toString();
00594 }
00595 
00596 }
00597 }
00598 
00599 #endif /* SIRIKATA_URL_HPP__ */